PageRenderTime 54ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/corpus/reader/switchboard.py

https://github.com/BrucePHill/nltk
Python | 120 lines | 84 code | 21 blank | 15 comment | 13 complexity | 41e718e1c66ca7087bb8460f6a9835ea MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Switchboard Corpus Reader
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. from __future__ import unicode_literals
  8. import re
  9. from nltk.tag import str2tuple
  10. from nltk import compat
  11. from .util import *
  12. from .api import *
  13. @compat.python_2_unicode_compatible
  14. class SwitchboardTurn(list):
  15. """
  16. A specialized list object used to encode switchboard utterances.
  17. The elements of the list are the words in the utterance; and two
  18. attributes, ``speaker`` and ``id``, are provided to retrieve the
  19. spearker identifier and utterance id. Note that utterance ids
  20. are only unique within a given discourse.
  21. """
  22. def __init__(self, words, speaker, id):
  23. list.__init__(self, words)
  24. self.speaker = speaker
  25. self.id = int(id)
  26. def __repr__(self):
  27. if len(self) == 0:
  28. text = ''
  29. elif isinstance(self[0], tuple):
  30. text = ' '.join('%s/%s' % w for w in self)
  31. else:
  32. text = ' '.join(self)
  33. return '<%s.%s: %r>' % (self.speaker, self.id, text)
  34. class SwitchboardCorpusReader(CorpusReader):
  35. _FILES = ['tagged']
  36. # Use the "tagged" file even for non-tagged data methods, since
  37. # it's tokenized.
  38. def __init__(self, root, tag_mapping_function=None):
  39. CorpusReader.__init__(self, root, self._FILES)
  40. self._tag_mapping_function = tag_mapping_function
  41. def words(self):
  42. return StreamBackedCorpusView(self.abspath('tagged'),
  43. self._words_block_reader)
  44. def tagged_words(self, simplify_tags=False):
  45. def tagged_words_block_reader(stream):
  46. return self._tagged_words_block_reader(stream, simplify_tags)
  47. return StreamBackedCorpusView(self.abspath('tagged'),
  48. tagged_words_block_reader)
  49. def turns(self):
  50. return StreamBackedCorpusView(self.abspath('tagged'),
  51. self._turns_block_reader)
  52. def tagged_turns(self, simplify_tags=False):
  53. def tagged_turns_block_reader(stream):
  54. return self._tagged_turns_block_reader(stream, simplify_tags)
  55. return StreamBackedCorpusView(self.abspath('tagged'),
  56. tagged_turns_block_reader)
  57. def discourses(self):
  58. return StreamBackedCorpusView(self.abspath('tagged'),
  59. self._discourses_block_reader)
  60. def tagged_discourses(self, simplify_tags=False):
  61. def tagged_discourses_block_reader(stream):
  62. return self._tagged_discourses_block_reader(stream, simplify_tags)
  63. return StreamBackedCorpusView(self.abspath('tagged'),
  64. tagged_discourses_block_reader)
  65. def _discourses_block_reader(self, stream):
  66. # returns at most 1 discourse. (The other methods depend on this.)
  67. return [[self._parse_utterance(u, include_tag=False)
  68. for b in read_blankline_block(stream)
  69. for u in b.split('\n') if u.strip()]]
  70. def _tagged_discourses_block_reader(self, stream, simplify_tags=False):
  71. # returns at most 1 discourse. (The other methods depend on this.)
  72. return [[self._parse_utterance(u, include_tag=True,
  73. simplify_tags=simplify_tags)
  74. for b in read_blankline_block(stream)
  75. for u in b.split('\n') if u.strip()]]
  76. def _turns_block_reader(self, stream):
  77. return self._discourses_block_reader(stream)[0]
  78. def _tagged_turns_block_reader(self, stream, simplify_tags=False):
  79. return self._tagged_discourses_block_reader(stream, simplify_tags)[0]
  80. def _words_block_reader(self, stream):
  81. return sum(self._discourses_block_reader(stream)[0], [])
  82. def _tagged_words_block_reader(self, stream, simplify_tags=False):
  83. return sum(self._tagged_discourses_block_reader(stream,
  84. simplify_tags)[0], [])
  85. _UTTERANCE_RE = re.compile('(\w+)\.(\d+)\:\s*(.*)')
  86. _SEP = '/'
  87. def _parse_utterance(self, utterance, include_tag, simplify_tags=False):
  88. m = self._UTTERANCE_RE.match(utterance)
  89. if m is None:
  90. raise ValueError('Bad utterance %r' % utterance)
  91. speaker, id, text = m.groups()
  92. words = [str2tuple(s, self._SEP) for s in text.split()]
  93. if not include_tag:
  94. words = [w for (w,t) in words]
  95. elif simplify_tags:
  96. words = [(w, self._tag_mapping_function(t)) for (w,t) in words]
  97. return SwitchboardTurn(words, speaker, id)