
/nltk-old/contrib/nltk_contrib/unimelb/460/2003/cawilson/wordTranslator.py

http://nltk.googlecode.com/
# 433-460 Project
# Author: Charlotte Wilson (cawilson)
# Date: October 2003
#
# Word Translation and Sense-Tag Generation Module
"""
This module contains the WordTranslator class, which works out the possible
translations of a word that we aim to disambiguate (called from the
senseDisambiguator module). It uses a sentence-aligned bilingual training
corpus with one sentence per line to train a statistical translation method.
The statistical method is based on contingency tables and phi squared scores,
as described in William A. Gale and Kenneth Ward Church (1991), "Identifying
Word Correspondences in Parallel Texts", Proceedings of the DARPA SNL Workshop.
This module also contains methods to return the word's translation in a
given sentence and to group these possible translations into "sense groups"
according to common morphology.
"""
from __future__ import division

import re

from nltk.probability import FreqDist
from nltk.tokenizer import WSTokenizer

##############################################################################

class WordTranslator:
    """
    A class to work out the translation of an English word in a given
    German sentence and to group translations with a common morpheme
    into a single "sense group".

    The translation is performed using a statistical method - phi
    squared (described below) - and is trained on a one-to-one
    sentence-aligned bilingual corpus with one sentence per line.
    Translations can then be grouped by common morphology into (rather
    fuzzy) sense groupings. This allows for translation morphemes occurring
    in compound words and would be unnecessary if the corpus were stemmed
    (including compound splitting) before training.
    For further discussion of these issues see the associated report.
    """

    def __init__(self):
        """
        Constructor for WordTranslator. Create frequency distributions
        for the ambiguous word, its possible translations and their
        co-occurrences. Initialize class variables -
        the ambiguous word, the total number of aligned sentences,
        a list of possible translations and mappings of those
        translations to sense groupings.
        """
        self._cooccur_fdist = FreqDist()
        self._ambig_word_fdist = FreqDist()
        self._trans_fdist = FreqDist()
        self._ambig_word = None
        self._num_sents = 0
        self._poss_translations = None
        self._sense_dict = None
        self._sense_list = None

    def train(self, ambig_word, de_en_corpus, numfiles=0):
        """
        Train the word translator for the given ambiguous word on
        the corresponding corpus files. If numfiles is 0, train
        on the whole corpus; otherwise train on either the number
        of files specified or the whole corpus, whichever is smaller.
        """
        # set the ambiguous word to train on
        self._ambig_word = ambig_word

        # train the word translator for the given word on the
        # parallel corpus
        if numfiles == 0 or numfiles > len(de_en_corpus.items()):
            numfiles = len(de_en_corpus.items())
        for i in range(numfiles):
            english_file = de_en_corpus.items()[i][0]
            german_file = de_en_corpus.items()[i][1]
            self.train_file(de_en_corpus.path(english_file),
                            de_en_corpus.path(german_file))
        return

    def train_file(self, english_filename, german_filename):
        """
        Given corresponding corpus files - one per language -
        train the word translator on them. Parallel corpus files
        are assumed to be one-to-one sentence aligned and to contain
        one sentence per line.
        Perform some preprocessing such as the removal of sentence-
        initial capitalisation, sentence tokenization and removal of
        document titles. German nouns and proper nouns are
        capitalised, so we don't want to convert the whole sentence
        to lower case, as we would lose important contextual
        information.
        Increment frequency counts for both co-occurrence and
        individual frequencies, i.e. counts of the number of times the
        ambiguous English word occurs, the number of times every German
        word occurs and the number of times a German word co-occurs
        in an aligned sentence with the ambiguous English word.
        Note: sentence punctuation is not included in these frequency
        counts.
        """
        # our corpus has one sentence per line, so read line by line
        english_sents = open(english_filename).readlines()
        german_sents = open(german_filename).readlines()

        # increment the sentence count
        num_sents = len(english_sents)
        self._num_sents = self._num_sents + num_sents

        # remove sentence-initial capitalisation - German nouns
        # begin with a capital letter, therefore just lower the
        # first character of the sentence
        for sent_num in xrange(num_sents):
            english_sent = english_sents[sent_num]
            english_sent = english_sent[0].lower() + \
                english_sent[1:]
            german_sent = german_sents[sent_num]
            german_sent = german_sent[0].lower() + german_sent[1:]

            # tokenize the sentences
            english_tokens = WSTokenizer().tokenize(english_sent)
            german_tokens = WSTokenizer().tokenize(german_sent)

            # discard the line if it contains a doc title
            if len(english_tokens) > 0 and len(german_tokens) > 0:
                doc_regexp = re.compile(r'<DOC')
                if doc_regexp.match(english_tokens[0].type()) \
                        and doc_regexp.match(german_tokens[0].type()):
                    continue

            # increment the frequency counts - both co-occurrence
            # frequency and individual frequencies
            contains_ambig_word = False

            # find the ambiguous word in the English sentence
            # and increment its frequency count
            for english_token in english_tokens:
                if english_token.type() == self._ambig_word:
                    self._ambig_word_fdist.inc(self._ambig_word)
                    contains_ambig_word = True
                    break

            # increment the frequency count for each distinct German word
            in_sent = {}
            for german_token in german_tokens:
                german_word = german_token.type()
                if not in_sent.has_key(german_word) and \
                        self.not_punctuation(german_word):
                    self._trans_fdist.inc(german_word)
                    in_sent[german_word] = True
                    # increment the co-occurrence frequency count
                    if contains_ambig_word:
                        self._cooccur_fdist.inc(german_word)
        return

    def not_punctuation(self, word):
        """
        Returns True if the given word is not punctuation and False
        otherwise.
        """
        # uses a regular expression for punctuation: any number of
        # non-alphanumeric characters
        punct_regexp = re.compile(r'[^a-zA-Z0-9]*$')
        if punct_regexp.match(word):
            return False
        else:
            return True

    def get_trans_score_pairs(self):
        """
        Get a list of (score, translation) tuples, sorted from most
        likely to least likely. Every possible translation of the
        English word (i.e. every German word that occurs in a
        sentence aligned with an English sentence containing the
        English word) is given a score between 0 and 1.
        This score is a measure of the dependence between the
        ambiguous word and the possible translation. The score is
        calculated from the values in a 2-by-2 contingency table
        (a table containing counts of word co-occurrence and
        word non-co-occurrence). The score (called phi squared) is a
        variant of the chi squared score, commonly
        used in the identification of collocations. Phi squared
        is described in William A. Gale and Kenneth Ward Church
        (1991), "Identifying Word Correspondences in Parallel Texts",
        Proceedings of the DARPA SNL Workshop.
        See the associated report for a further explanation of
        contingency tables and phi squared.
        The list of possible translations is then sorted from most
        likely to least likely according to phi squared score. The
        higher the phi squared score, the more dependent the two words
        are and the more likely the German word is to be a translation
        of the English word.
        """
        # for every possible translation of the ambiguous word
        # construct a contingency table and calculate the phi
        # squared score
        samples = []
        for sample in self._cooccur_fdist.samples():
            # calculate the contingency table values
            a = self._cooccur_fdist.count(sample)
            b = self._ambig_word_fdist.count(self._ambig_word) - a
            c = self._trans_fdist.count(sample) - a
            d = self._num_sents - a - b - c

            # do the maths
            score = self.calc_score(a, b, c, d)
            samples.append((score, sample))

        # sort from most likely to least likely translation
        samples.sort()
        samples.reverse()
        return samples

    def calc_score(self, a, b, c, d):
        """
        Given the values of a 2-by-2 contingency table, calculate
        the phi squared score as per Gale and Church's formula (see
        the reference in the description of get_trans_score_pairs()
        above).
        a = no. of sents containing both the German and English words
        b = no. of sents containing the English but not the German word
        c = no. of sents containing the German but not the English word
        d = no. of sents containing neither the English nor the German word
        See the associated report for a further explanation of
        contingency tables and phi squared.
        """
        top = (a * d - b * c) ** 2
        bottom = (a + b) * (a + c) * (b + d) * (c + d)
        score = top / bottom
        return score

    def get_translations(self):
        """
        Get a list of the ambiguous word's possible translations,
        sorted from most likely to least likely.
        """
        return [trans for (score, trans) in
                self.get_trans_score_pairs()]

    def get_translation(self, german_sentence):
        """
        Get the translation of the ambiguous word in the given German
        sentence. We take this to be the highest-ranked possible
        translation that occurs in the German sentence.
        """
        # tokenize the German sentence
        german_tokens = WSTokenizer().tokenize(german_sentence)
        german_types = [token.type() for token in german_tokens]

        # get the list of possible translations sorted from most
        # likely to least likely, if we haven't already
        if self._poss_translations == None:
            self._poss_translations = self.get_translations()
        for poss_trans in self._poss_translations:
            # take the most likely translation that occurs in our
            # German sentence
            if german_types.count(poss_trans) != 0:
                return poss_trans
        return None

    def get_sense_tag(self, german_sentence, num_senses):
        """
        Get the sense-tag of the ambiguous word in the given German
        sentence containing its translation. German translations
        are grouped into "sense groups" because many translations are
        compound nouns containing morphemes that are translations not
        only of the ambiguous English word, but also of other words
        or concepts (not to mention grammatical morphology such as
        plural endings etc.).
        For example: "Zins" is a direct translation of "interest";
        however, it occurs translated in German corpora within
        compounds such as "Zinserhoehung" or "Zinsniveau".
        Sense-grouping aims to collapse such compounds into the one
        "sense group" corresponding to "Zins".
        If num_senses is greater than 0, we take the most likely
        num_senses senses and return the sense-tag of the sense
        group that the direct translation fits into. If the word's
        direct translation does not occur in a sense group, we return
        None.
        If num_senses is 0, we do not attempt to group the
        translations into sense groups. Instead we return the direct
        translation.
        """
        # get the most likely translation
        translation = self.get_translation(german_sentence)
        if translation == None:
            return None

        # do not perform sense-grouping - return the direct translation
        if num_senses == 0:
            return translation

        # create the sense dictionary if we haven't already
        if self._sense_dict == None:
            self._sense_dict, self._sense_list = self.sense_group(
                self._poss_translations, num_senses)

        # get the translation's corresponding sense tag
        if self._sense_dict.has_key(translation):
            return self._sense_dict[translation]
        else:
            for sense in self._sense_list:
                low_trans = translation.lower()
                if low_trans.find(sense) != -1:
                    return sense
        return None

    def sense_group(self, trans_list, num_senses):
        """
        Group the translations into "sense groups" according to common
        morphology. The grouping is done by searching for common
        substrings in the translations. We take a 4-letter prefix
        from each of the top 30 possible translations and search for
        this prefix within the other translations. The idea is that the
        4-letter prefix contains information that will help to locate
        common morphemes that occur in the translations.
        Of course this is not foolproof, as some morphemes may not
        occur as prefixes and 4 letters may not distinguish some
        morphemes from one another - see the associated report for
        a broader discussion of this strategy.
        For each of the resulting "sense groups" the phi squared score
        (discussed above) is recalculated for the group and the
        groups are then sorted by these scores. The specified number
        of highest-ranked senses (given by num_senses) is then taken
        from the sense groups.
        Returns a dictionary of translation/sense mappings and a list
        of the valid senses (identified by prefix).
        """
        # take the first 30 possible translations and search for
        # prefixes that form common substrings
        senses = {}
        trans_len = 30
        for i in range(len(trans_list[:trans_len])):
            trans = trans_list[i]
            prefix = trans[:4].lower()
            if not senses.has_key(prefix):
                senses[prefix] = [trans]
            for trans1 in trans_list[i:trans_len] + trans_list[:i]:
                substring_index = trans1.lower().find(prefix)
                if substring_index != -1:
                    if senses[prefix].count(trans1) == 0:
                        senses[prefix].append(trans1)

        # recalculate the phi squared score for each sense group
        scored_senses = []
        for sense in senses.keys():
            total_a = 0
            total_c = 0
            translations = senses[sense]
            for trans in translations:
                a = self._cooccur_fdist.count(trans)
                c = self._trans_fdist.count(trans) - a
                total_a += a
                total_c += c
            b = self._ambig_word_fdist.count(self._ambig_word) \
                - total_a
            d = self._num_sents - total_a - b - total_c
            new_score = self.calc_score(total_a, b, total_c, d)
            # build a list of (score, sense) pairs
            scored_senses.append((new_score, sense))
        scored_senses.sort()
        scored_senses.reverse()

        # take the specified number of highest-ranked senses
        valid_senses = [sense for (score, sense) in
                        scored_senses[:num_senses]]

        # reverse the list of senses, so that we take the most
        # likely sense for the translation
        valid_senses.reverse()

        # construct a translation -> sense mapping for the valid senses
        sense_dict = {}
        for valid_sense in valid_senses:
            t_list = senses[valid_sense]
            for trans in t_list:
                sense_dict[trans] = valid_sense
        return sense_dict, valid_senses
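
##############################################################################
# A minimal usage sketch (not part of the original module). It assumes a pair
# of one-to-one sentence-aligned corpus files with one sentence per line; the
# filenames "corpus.en" and "corpus.de" and the sample sentence below are
# placeholders, not resources shipped with this project. train() normally
# sets the ambiguous word from an NLTK corpus object, so when calling
# train_file() directly we set _ambig_word by hand.
if __name__ == '__main__':
    translator = WordTranslator()
    translator._ambig_word = 'interest'
    translator.train_file('corpus.en', 'corpus.de')

    # the ten highest-scoring (score, translation) pairs
    for score, trans in translator.get_trans_score_pairs()[:10]:
        print score, trans

    # translation and sense tag for a single German sentence
    sentence = 'die Zinserhoehung wurde heute angekuendigt'
    print translator.get_translation(sentence)
    print translator.get_sense_tag(sentence, 5)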