PageRenderTime 54ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/Lib/site-packages/nltk/translate/api.py

https://gitlab.com/pierreEffiScience/TwitterClustering
Python | 321 lines | 304 code | 3 blank | 14 comment | 0 complexity | 836edf93883d946c42c17fb3d1cc8fe3 MD5 | raw file
  1. # Natural Language Toolkit: API for alignment and translation objects
  2. #
  3. # Copyright (C) 2001-2016 NLTK Project
  4. # Author: Will Zhang <wilzzha@gmail.com>
  5. # Guan Gui <ggui@student.unimelb.edu.au>
  6. # Steven Bird <stevenbird1@gmail.com>
  7. # Tah Wei Hoon <hoon.tw@gmail.com>
  8. # URL: <http://nltk.org/>
  9. # For license information, see LICENSE.TXT
  10. from __future__ import print_function, unicode_literals
  11. import subprocess
  12. from collections import namedtuple
  13. from nltk.compat import python_2_unicode_compatible, string_types
  14. @python_2_unicode_compatible
  15. class AlignedSent(object):
  16. """
  17. Return an aligned sentence object, which encapsulates two sentences
  18. along with an ``Alignment`` between them.
  19. >>> from nltk.translate import AlignedSent, Alignment
  20. >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
  21. ... ['the', 'house', 'is', 'small'], Alignment.fromstring('0-2 1-3 2-1 3-0'))
  22. >>> algnsent.words
  23. ['klein', 'ist', 'das', 'Haus']
  24. >>> algnsent.mots
  25. ['the', 'house', 'is', 'small']
  26. >>> algnsent.alignment
  27. Alignment([(0, 2), (1, 3), (2, 1), (3, 0)])
  28. >>> from nltk.corpus import comtrans
  29. >>> print(comtrans.aligned_sents()[54])
  30. <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
  31. >>> print(comtrans.aligned_sents()[54].alignment)
  32. 0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
  33. :param words: source language words
  34. :type words: list(str)
  35. :param mots: target language words
  36. :type mots: list(str)
  37. :param alignment: the word-level alignments between the source
  38. and target language
  39. :type alignment: Alignment
  40. """
  41. def __init__(self, words, mots, alignment=None):
  42. self._words = words
  43. self._mots = mots
  44. if alignment is None:
  45. self.alignment = Alignment([])
  46. else:
  47. assert type(alignment) is Alignment
  48. self.alignment = alignment
  49. @property
  50. def words(self):
  51. return self._words
  52. @property
  53. def mots(self):
  54. return self._mots
  55. def _get_alignment(self):
  56. return self._alignment
  57. def _set_alignment(self, alignment):
  58. _check_alignment(len(self.words), len(self.mots), alignment)
  59. self._alignment = alignment
  60. alignment = property(_get_alignment, _set_alignment)
  61. def __repr__(self):
  62. """
  63. Return a string representation for this ``AlignedSent``.
  64. :rtype: str
  65. """
  66. words = "[%s]" % (", ".join("'%s'" % w for w in self._words))
  67. mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots))
  68. return "AlignedSent(%s, %s, %r)" % (words, mots, self._alignment)
  69. def _to_dot(self):
  70. """
  71. Dot representation of the aligned sentence
  72. """
  73. s = 'graph align {\n'
  74. s += 'node[shape=plaintext]\n'
  75. # Declare node
  76. for w in self._words:
  77. s += '"%s_source" [label="%s"] \n' % (w, w)
  78. for w in self._mots:
  79. s += '"%s_target" [label="%s"] \n' % (w, w)
  80. # Alignment
  81. for u,v in self._alignment:
  82. s += '"%s_source" -- "%s_target" \n' % (self._words[u] , self._mots[v] )
  83. # Connect the source words
  84. for i in range(len(self._words)-1) :
  85. s += '"%s_source" -- "%s_source" [style=invis]\n' % (self._words[i] , self._words[i+1])
  86. # Connect the target words
  87. for i in range(len(self._mots)-1) :
  88. s += '"%s_target" -- "%s_target" [style=invis]\n' % (self._mots[i] , self._mots[i+1])
  89. # Put it in the same rank
  90. s += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
  91. s += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
  92. s += '}'
  93. return s
  94. def _repr_svg_(self):
  95. """
  96. Ipython magic : show SVG representation of this ``AlignedSent``.
  97. """
  98. dot_string = self._to_dot().encode('utf8')
  99. output_format = 'svg'
  100. try:
  101. process = subprocess.Popen(['dot', '-T%s' % output_format], stdin=subprocess.PIPE,
  102. stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  103. except OSError:
  104. raise Exception('Cannot find the dot binary from Graphviz package')
  105. out, err = process.communicate(dot_string)
  106. return out.decode('utf8')
  107. def __str__(self):
  108. """
  109. Return a human-readable string representation for this ``AlignedSent``.
  110. :rtype: str
  111. """
  112. source = " ".join(self._words)[:20] + "..."
  113. target = " ".join(self._mots)[:20] + "..."
  114. return "<AlignedSent: '%s' -> '%s'>" % (source, target)
  115. def invert(self):
  116. """
  117. Return the aligned sentence pair, reversing the directionality
  118. :rtype: AlignedSent
  119. """
  120. return AlignedSent(self._mots, self._words,
  121. self._alignment.invert())
  122. @python_2_unicode_compatible
  123. class Alignment(frozenset):
  124. """
  125. A storage class for representing alignment between two sequences, s1, s2.
  126. In general, an alignment is a set of tuples of the form (i, j, ...)
  127. representing an alignment between the i-th element of s1 and the
  128. j-th element of s2. Tuples are extensible (they might contain
  129. additional data, such as a boolean to indicate sure vs possible alignments).
  130. >>> from nltk.translate import Alignment
  131. >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
  132. >>> a.invert()
  133. Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
  134. >>> print(a.invert())
  135. 0-0 1-0 2-1 2-2
  136. >>> a[0]
  137. [(0, 1), (0, 0)]
  138. >>> a.invert()[2]
  139. [(2, 1), (2, 2)]
  140. >>> b = Alignment([(0, 0), (0, 1)])
  141. >>> b.issubset(a)
  142. True
  143. >>> c = Alignment.fromstring('0-0 0-1')
  144. >>> b == c
  145. True
  146. """
  147. def __new__(cls, pairs):
  148. self = frozenset.__new__(cls, pairs)
  149. self._len = (max(p[0] for p in self) if self != frozenset([]) else 0)
  150. self._index = None
  151. return self
  152. @classmethod
  153. def fromstring(cls, s):
  154. """
  155. Read a giza-formatted string and return an Alignment object.
  156. >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
  157. Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])
  158. :type s: str
  159. :param s: the positional alignments in giza format
  160. :rtype: Alignment
  161. :return: An Alignment object corresponding to the string representation ``s``.
  162. """
  163. return Alignment([_giza2pair(a) for a in s.split()])
  164. def __getitem__(self, key):
  165. """
  166. Look up the alignments that map from a given index or slice.
  167. """
  168. if not self._index:
  169. self._build_index()
  170. return self._index.__getitem__(key)
  171. def invert(self):
  172. """
  173. Return an Alignment object, being the inverted mapping.
  174. """
  175. return Alignment(((p[1], p[0]) + p[2:]) for p in self)
  176. def range(self, positions=None):
  177. """
  178. Work out the range of the mapping from the given positions.
  179. If no positions are specified, compute the range of the entire mapping.
  180. """
  181. image = set()
  182. if not self._index:
  183. self._build_index()
  184. if not positions:
  185. positions = list(range(len(self._index)))
  186. for p in positions:
  187. image.update(f for _,f in self._index[p])
  188. return sorted(image)
  189. def __repr__(self):
  190. """
  191. Produce a Giza-formatted string representing the alignment.
  192. """
  193. return "Alignment(%r)" % sorted(self)
  194. def __str__(self):
  195. """
  196. Produce a Giza-formatted string representing the alignment.
  197. """
  198. return " ".join("%d-%d" % p[:2] for p in sorted(self))
  199. def _build_index(self):
  200. """
  201. Build a list self._index such that self._index[i] is a list
  202. of the alignments originating from word i.
  203. """
  204. self._index = [[] for _ in range(self._len + 1)]
  205. for p in self:
  206. self._index[p[0]].append(p)
  207. def _giza2pair(pair_string):
  208. i, j = pair_string.split("-")
  209. return int(i), int(j)
  210. def _naacl2pair(pair_string):
  211. i, j, p = pair_string.split("-")
  212. return int(i), int(j)
  213. def _check_alignment(num_words, num_mots, alignment):
  214. """
  215. Check whether the alignments are legal.
  216. :param num_words: the number of source language words
  217. :type num_words: int
  218. :param num_mots: the number of target language words
  219. :type num_mots: int
  220. :param alignment: alignment to be checked
  221. :type alignment: Alignment
  222. :raise IndexError: if alignment falls outside the sentence
  223. """
  224. assert type(alignment) is Alignment
  225. if not all(0 <= pair[0] < num_words for pair in alignment):
  226. raise IndexError("Alignment is outside boundary of words")
  227. if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment):
  228. raise IndexError("Alignment is outside boundary of mots")
  229. PhraseTableEntry = namedtuple('PhraseTableEntry', ['trg_phrase', 'log_prob'])
  230. class PhraseTable(object):
  231. """
  232. In-memory store of translations for a given phrase, and the log
  233. probability of the those translations
  234. """
  235. def __init__(self):
  236. self.src_phrases = dict()
  237. def translations_for(self, src_phrase):
  238. """
  239. Get the translations for a source language phrase
  240. :param src_phrase: Source language phrase of interest
  241. :type src_phrase: tuple(str)
  242. :return: A list of target language phrases that are translations
  243. of ``src_phrase``, ordered in decreasing order of
  244. likelihood. Each list element is a tuple of the target
  245. phrase and its log probability.
  246. :rtype: list(PhraseTableEntry)
  247. """
  248. return self.src_phrases[src_phrase]
  249. def add(self, src_phrase, trg_phrase, log_prob):
  250. """
  251. :type src_phrase: tuple(str)
  252. :type trg_phrase: tuple(str)
  253. :param log_prob: Log probability that given ``src_phrase``,
  254. ``trg_phrase`` is its translation
  255. :type log_prob: float
  256. """
  257. entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob)
  258. if src_phrase not in self.src_phrases:
  259. self.src_phrases[src_phrase] = []
  260. self.src_phrases[src_phrase].append(entry)
  261. self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob,
  262. reverse=True)
  263. def __contains__(self, src_phrase):
  264. return src_phrase in self.src_phrases