PageRenderTime 45ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/nltk/misc/babelfish.py

https://github.com/haewoon/nltk
Python | 193 lines | 169 code | 2 blank | 22 comment | 5 complexity | e5276c4ab3e878c1291899c968e47a1c MD5 | raw file
Possible License(s): Apache-2.0
  1. # coding: utf8
  2. # babelizer.py - API for simple access to babelfish.altavista.com.
  3. # Requires python 2.0 or better.
  4. # From: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/64937
  5. # Author: Jonathan Feinberg <jdf@pobox.com>
  6. # Modified by Steven Bird to work with current babelfish
  7. #
  8. # See it in use at http://babel.MrFeinberg.com/
  9. r"""API for simple access to Babelfish (originally babelfish.altavista.com; requests are now sent to babelfish.yahoo.com).
  10. Summary:
  11. >>> from nltk.misc import babelfish as babelizer
  12. >>> babelizer.available_languages
  13. ['Chinese', 'English', 'French', 'German', 'Greek', 'Italian', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Spanish']
  14. >>> babelizer.translate('How much is that doggie in the window?',
  15. ... 'english', 'french')
  16. 'Combien co\xfbte ce chienchien dans la fen\xeatre ?'
  17. """
  18. import re
  19. import string
  20. import urllib
  21. import sys
  22. """
  23. Various patterns I have encountered in looking for the babelfish result.
  24. We try each of them in turn, based on the relative number of times I've
  25. seen each of these patterns. $1.00 to anyone who can provide a heuristic
  26. for knowing which one to use. This includes AltaVista employees.
  27. """
  28. __where = [ re.compile(r'<div id="result"><div style="padding:0.6em;">([^<]*)'),
  29. re.compile(r'name=\"q\">([^<]*)'),
  30. re.compile(r'td bgcolor=white>([^<]*)'),
  31. re.compile(r'<\/strong><br>([^<]*)'),
  32. re.compile(r'padding:10px[^>]+>([^<]*)')
  33. ]
  34. __languages = { 'english' : 'en',
  35. 'french' : 'fr',
  36. 'spanish' : 'es',
  37. 'german' : 'de',
  38. 'greek' : 'el',
  39. 'italian' : 'it',
  40. 'portuguese': 'pt',
  41. 'chinese' : 'zh',
  42. 'japanese' : 'ja',
  43. 'korean' : 'ko',
  44. 'russian' : 'ru'
  45. }
  46. """
  47. All of the available language names.
  48. """
  49. available_languages = sorted([x.title() for x in __languages])
  50. class BabelizerError(Exception):
  51. """
  52. Calling translate() or babelize() can raise a BabelizerError
  53. """
  54. class BabelfishChangedError(BabelizerError):
  55. """
  56. Thrown when babelfish.yahoo.com changes some detail of their HTML layout,
  57. and babelizer no longer submits data in the correct form, or can no
  58. longer parse the results.
  59. """
  60. class BabelizerIOError(BabelizerError):
  61. """
  62. Thrown for various networking and IO errors.
  63. """
  64. def clean(text):
  65. return re.sub(r'\s+', ' ', text.strip())
  66. def translate(phrase, source, target):
  67. """
  68. Use babelfish to translate phrase from source language to target language.
  69. It's only guaranteed to work if 'english' is one of the two languages.
  70. :raise BabelizeError: If an error is encountered.
  71. """
  72. phrase = clean(phrase)
  73. try:
  74. source_code = __languages[source]
  75. target_code = __languages[target]
  76. except KeyError, lang:
  77. raise ValueError, "Language %s not available" % lang
  78. params = urllib.urlencode({'doit': 'done',
  79. 'tt': 'urltext',
  80. 'urltext': phrase,
  81. 'lp': source_code + '_' + target_code})
  82. try:
  83. response = urllib.urlopen('http://babelfish.yahoo.com/translate_txt', params)
  84. except IOError, what:
  85. raise BabelizerIOError("Couldn't talk to server: %s" % what)
  86. html = response.read()
  87. for regex in __where:
  88. match = regex.search(html)
  89. if match: break
  90. if not match: raise BabelfishChangedError("Can't recognize translated string.")
  91. return clean(match.group(1))
  92. def babelize(phrase, source, target, limit = 12):
  93. """
  94. Use babelfish to translate back and forth between source and
  95. target until either no more changes occur in translation or
  96. limit iterations have been reached, whichever comes first.
  97. It's only guaranteed to work if 'english' is one of the two
  98. languages.
  99. :raise BabelizeError: If an error is encountered.
  100. """
  101. phrase = clean(phrase)
  102. seen = set([phrase])
  103. yield phrase
  104. flip = {source: target, target: source}
  105. next = source
  106. for i in range(limit):
  107. phrase = translate(phrase, next, flip[next])
  108. if phrase in seen:
  109. break
  110. seen.add(phrase)
  111. yield phrase
  112. next = flip[next]
  113. HELP = """NLTK Babelizer Commands:
  114. All single-word inputs are commands:
  115. help: this help message
  116. languages: print the list of languages
  117. language: the name of a language to use"""
  118. def babelize_shell():
  119. """
  120. An interactive shell that uses babelfish to
  121. translate back and forth between source and
  122. target until either no more changes occur in translation or
  123. limit iterations have been reached, whichever comes first.
  124. It's only guaranteed to work if 'english' is one of the two
  125. languages.
  126. :raise BabelizeError: If an error is encountered.
  127. """
  128. print "NLTK Babelizer: type 'help' for a list of commands."
  129. language = ''
  130. phrase = ''
  131. try:
  132. while True:
  133. command = raw_input('Babel> ')
  134. command = clean(command)
  135. if ' ' not in command:
  136. command = command.lower()
  137. if command == 'help':
  138. print HELP
  139. elif command == 'languages':
  140. print ' '.join(sorted(__languages))
  141. elif command in __languages:
  142. language = command
  143. elif command in ['quit', 'bye', 'end']:
  144. break
  145. elif command == 'run':
  146. if not language:
  147. print "Please specify a language first (type 'languages' for a list)."
  148. elif not phrase:
  149. print "Please enter a phrase first (just type it in at the prompt)."
  150. else:
  151. for count, new_phrase in enumerate(babelize(phrase, 'english', language)):
  152. print "%s>" % count, new_phrase
  153. sys.stdout.flush()
  154. else:
  155. print "Command not recognized (type 'help' for help)."
  156. # if the command contains a space, it must have multiple words, and be a new phrase
  157. else:
  158. phrase = command
  159. except EOFError:
  160. print
  161. pass
  162. # I won't take that from you, or from your doggie (Korean)
  163. # the pig I found looked happy (chinese)
  164. # absence makes the heart grow fonder (italian)
  165. # more idioms: http://www.idiomsite.com/
  166. if __name__ == '__main__':
  167. babelize_shell()