PageRenderTime 59ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/resources/nltk/app/collocations_app.py

https://github.com/rplevy/clojure-nltk
Python | 339 lines | 277 code | 50 blank | 12 comment | 19 complexity | 3e25e091dbcbc9a3f2ae1c249b85df16 MD5 | raw file
  1. # Natural Language Toolkit: Collocations Application
  2. # Much of the GUI code is imported from concordance.py; We intend to merge these tools together
  3. # Copyright (C) 2001-2012 NLTK Project
  4. # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. #
  8. import threading
  9. import tkFont
  10. from Tkinter import (Button, END, Frame, IntVar, LEFT, Label, Menu,
  11. OptionMenu, SUNKEN, Scrollbar, StringVar,
  12. Text, Tk)
  13. from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino,
  14. indian, floresta, mac_morpho, machado, cess_esp)
  15. from nltk.util import in_idle
  16. from nltk.probability import FreqDist
  17. from nltk.text import Text as TextDomain
  18. CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
  19. ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
  20. _DEFAULT = 'English: Brown Corpus (Humor)'
  21. _CORPORA = {
  22. 'Catalan: CESS-CAT Corpus':
  23. lambda: cess_cat.words(),
  24. 'English: Brown Corpus':
  25. lambda: brown.words(),
  26. 'English: Brown Corpus (Press)':
  27. lambda: brown.words(categories=['news', 'editorial', 'reviews']),
  28. 'English: Brown Corpus (Religion)':
  29. lambda: brown.words(categories='religion'),
  30. 'English: Brown Corpus (Learned)':
  31. lambda: brown.words(categories='learned'),
  32. 'English: Brown Corpus (Science Fiction)':
  33. lambda: brown.words(categories='science_fiction'),
  34. 'English: Brown Corpus (Romance)':
  35. lambda: brown.words(categories='romance'),
  36. 'English: Brown Corpus (Humor)':
  37. lambda: brown.words(categories='humor'),
  38. 'English: NPS Chat Corpus':
  39. lambda: nps_chat.words(),
  40. 'English: Wall Street Journal Corpus':
  41. lambda: treebank.words(),
  42. 'Chinese: Sinica Corpus':
  43. lambda: sinica_treebank.words(),
  44. 'Dutch: Alpino Corpus':
  45. lambda: alpino.words(),
  46. 'Hindi: Indian Languages Corpus':
  47. lambda: indian.words(files='hindi.pos'),
  48. 'Portuguese: Floresta Corpus (Portugal)':
  49. lambda: floresta.words(),
  50. 'Portuguese: MAC-MORPHO Corpus (Brazil)':
  51. lambda: mac_morpho.words(),
  52. 'Portuguese: Machado Corpus (Brazil)':
  53. lambda: machado.words(),
  54. 'Spanish: CESS-ESP Corpus':
  55. lambda: cess_esp.words()
  56. }
  57. class CollocationsView:
  58. _BACKGROUND_COLOUR='#FFF' #white
  59. def __init__(self):
  60. self.model = CollocationsModel()
  61. self.model.add_listener(self)
  62. self.top = Tk()
  63. self._init_top(self.top)
  64. self._init_menubar()
  65. self._init_widgets(self.top)
  66. self._bind_event_handlers()
  67. self.load_corpus(self.model.DEFAULT_CORPUS)
  68. def _init_top(self, top):
  69. top.geometry('550x650+50+50')
  70. top.title('NLTK Collocations List')
  71. top.bind('<Control-q>', self.destroy)
  72. top.minsize(550,650)
  73. def _init_widgets(self, parent):
  74. self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1))
  75. self._init_corpus_select(self.main_frame)
  76. self._init_results_box(self.main_frame)
  77. self._init_paging(self.main_frame)
  78. self._init_status(self.main_frame)
  79. self.main_frame.pack(fill='both', expand=True)
  80. def _init_corpus_select(self, parent):
  81. innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
  82. self.var = StringVar(innerframe)
  83. self.var.set(self.model.DEFAULT_CORPUS)
  84. Label(innerframe, justify=LEFT, text=' Corpus: ', background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left')
  85. other_corpora = self.model.CORPORA.keys().remove(self.model.DEFAULT_CORPUS)
  86. om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, *self.model.non_default_corpora())
  87. om['borderwidth'] = 0
  88. om['highlightthickness'] = 1
  89. om.pack(side='left')
  90. innerframe.pack(side='top', fill='x', anchor='n')
  91. def _init_status(self, parent):
  92. self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0)
  93. self.status.pack(side='top', anchor='sw')
  94. def _init_menubar(self):
  95. self._result_size = IntVar(self.top)
  96. menubar = Menu(self.top)
  97. filemenu = Menu(menubar, tearoff=0, borderwidth=0)
  98. filemenu.add_command(label='Exit', underline=1,
  99. command=self.destroy, accelerator='Ctrl-q')
  100. menubar.add_cascade(label='File', underline=0, menu=filemenu)
  101. editmenu = Menu(menubar, tearoff=0)
  102. rescntmenu = Menu(editmenu, tearoff=0)
  103. rescntmenu.add_radiobutton(label='20', variable=self._result_size,
  104. underline=0, value=20, command=self.set_result_size)
  105. rescntmenu.add_radiobutton(label='50', variable=self._result_size,
  106. underline=0, value=50, command=self.set_result_size)
  107. rescntmenu.add_radiobutton(label='100', variable=self._result_size,
  108. underline=0, value=100, command=self.set_result_size)
  109. rescntmenu.invoke(1)
  110. editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
  111. menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
  112. self.top.config(menu=menubar)
  113. def set_result_size(self, **kwargs):
  114. self.model.result_count = self._result_size.get()
  115. def _init_results_box(self, parent):
  116. innerframe = Frame(parent)
  117. i1 = Frame(innerframe)
  118. i2 = Frame(innerframe)
  119. vscrollbar = Scrollbar(i1, borderwidth=1)
  120. hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
  121. self.results_box = Text(i1,
  122. font=tkFont.Font(family='courier', size='16'),
  123. state='disabled', borderwidth=1,
  124. yscrollcommand=vscrollbar.set,
  125. xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
  126. self.results_box.pack(side='left', fill='both', expand=True)
  127. vscrollbar.pack(side='left', fill='y', anchor='e')
  128. vscrollbar.config(command=self.results_box.yview)
  129. hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
  130. hscrollbar.config(command=self.results_box.xview)
  131. #there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
  132. Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
  133. i1.pack(side='top', fill='both', expand=True, anchor='n')
  134. i2.pack(side='bottom', fill='x', anchor='s')
  135. innerframe.pack(side='top', fill='both', expand=True)
  136. def _init_paging(self, parent):
  137. innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
  138. self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled')
  139. prev.pack(side='left', anchor='center')
  140. self.next = next = Button(innerframe, text='Next', command=self.next, width='10', borderwidth=1, highlightthickness=1, state='disabled')
  141. next.pack(side='right', anchor='center')
  142. innerframe.pack(side='top', fill='y')
  143. self.reset_current_page()
  144. def reset_current_page(self):
  145. self.current_page = -1
  146. def _bind_event_handlers(self):
  147. self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
  148. self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)
  149. def handle_error_loading_corpus(self, event):
  150. self.status['text'] = 'Error in loading ' + self.var.get()
  151. self.unfreeze_editable()
  152. self.clear_results_box()
  153. self.freeze_editable()
  154. self.reset_current_page()
  155. def handle_corpus_loaded(self, event):
  156. self.status['text'] = self.var.get() + ' is loaded'
  157. self.unfreeze_editable()
  158. self.clear_results_box()
  159. self.reset_current_page()
  160. #self.next()
  161. collocations = self.model.next(self.current_page + 1)
  162. self.write_results(collocations)
  163. self.current_page += 1
  164. def corpus_selected(self, *args):
  165. new_selection = self.var.get()
  166. self.load_corpus(new_selection)
  167. def previous(self):
  168. self.freeze_editable()
  169. collocations = self.model.prev(self.current_page - 1)
  170. self.current_page= self.current_page - 1
  171. self.clear_results_box()
  172. self.write_results(collocations)
  173. self.unfreeze_editable()
  174. def next(self):
  175. self.freeze_editable()
  176. collocations = self.model.next(self.current_page + 1)
  177. self.clear_results_box()
  178. self.write_results(collocations)
  179. self.current_page += 1
  180. self.unfreeze_editable()
  181. def load_corpus(self, selection):
  182. if self.model.selected_corpus != selection:
  183. self.status['text'] = 'Loading ' + selection + '...'
  184. self.freeze_editable()
  185. self.model.load_corpus(selection)
  186. def freeze_editable(self):
  187. self.prev['state'] = 'disabled'
  188. self.next['state'] = 'disabled'
  189. def clear_results_box(self):
  190. self.results_box['state'] = 'normal'
  191. self.results_box.delete("1.0", END)
  192. self.results_box['state'] = 'disabled'
  193. def fire_event(self, event):
  194. #Firing an event so that rendering of widgets happen in the mainloop thread
  195. self.top.event_generate(event, when='tail')
  196. def destroy(self, *e):
  197. if self.top is None: return
  198. self.top.destroy()
  199. self.top = None
  200. def mainloop(self, *args, **kwargs):
  201. if in_idle(): return
  202. self.top.mainloop(*args, **kwargs)
  203. def unfreeze_editable(self):
  204. self.set_paging_button_states()
  205. def set_paging_button_states(self):
  206. if self.current_page == -1 or self.current_page == 0:
  207. self.prev['state'] = 'disabled'
  208. else:
  209. self.prev['state'] = 'normal'
  210. if self.model.is_last_page(self.current_page):
  211. self.next['state'] = 'disabled'
  212. else:
  213. self.next['state'] = 'normal'
  214. def write_results(self, results):
  215. self.results_box['state'] = 'normal'
  216. row = 1
  217. for each in results:
  218. self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n")
  219. row += 1
  220. self.results_box['state'] = 'disabled'
  221. class CollocationsModel:
  222. def __init__(self):
  223. self.listeners = []
  224. self.result_count = None
  225. self.selected_corpus = None
  226. self.collocations = None
  227. self.CORPORA = _CORPORA
  228. self.DEFAULT_CORPUS = _DEFAULT
  229. self.reset_results()
  230. def reset_results(self):
  231. self.result_pages = []
  232. self.results_returned = 0
  233. def add_listener(self, listener):
  234. self.listeners.append(listener)
  235. def notify_listeners(self, event):
  236. for each in self.listeners:
  237. each.fire_event(event)
  238. def load_corpus(self, name):
  239. self.selected_corpus = name
  240. self.collocations = None
  241. runner_thread = self.LoadCorpus(name, self)
  242. runner_thread.start()
  243. self.reset_results()
  244. def non_default_corpora(self):
  245. copy = []
  246. copy.extend(self.CORPORA.keys())
  247. copy.remove(self.DEFAULT_CORPUS)
  248. copy.sort()
  249. return copy
  250. def is_last_page(self, number):
  251. if number < len(self.result_pages):
  252. return False
  253. return self.results_returned + (number - len(self.result_pages)) * self.result_count >= len(self.collocations)
  254. def next(self, page):
  255. if (len(self.result_pages) - 1) < page:
  256. for i in range(page - (len(self.result_pages) - 1)):
  257. self.result_pages.append(self.collocations[self.results_returned:self.results_returned+self.result_count])
  258. self.results_returned += self.result_count
  259. return self.result_pages[page]
  260. def prev(self, page):
  261. if page == -1:
  262. return []
  263. return self.result_pages[page]
  264. class LoadCorpus(threading.Thread):
  265. def __init__(self, name, model):
  266. threading.Thread.__init__(self)
  267. self.model, self.name = model, name
  268. def run(self):
  269. try:
  270. words = self.model.CORPORA[self.name]()
  271. from operator import itemgetter
  272. text = filter(lambda w: len(w) > 2, words)
  273. fd = FreqDist(tuple(text[i:i+2]) for i in range(len(text)-1))
  274. vocab = FreqDist(text)
  275. scored = [((w1,w2), fd[(w1,w2)] ** 3 / float(vocab[w1] * vocab[w2])) for w1, w2 in fd]
  276. scored.sort(key=itemgetter(1), reverse=True)
  277. self.model.collocations = map(itemgetter(0), scored)
  278. self.model.notify_listeners(CORPUS_LOADED_EVENT)
  279. except Exception, e:
  280. print e
  281. self.model.notify_listeners(ERROR_LOADING_CORPUS_EVENT)
  282. #def collocations():
  283. # colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
  284. def app():
  285. c = CollocationsView()
  286. c.mainloop()
  287. if __name__ == '__main__':
  288. app()
  289. __all__ = ['app']