PageRenderTime 48ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/nltk/parse/dependencygraph.py

http://nltk.googlecode.com/
Python | 518 lines | 471 code | 13 blank | 34 comment | 24 complexity | d1108d85ed3023fa271024a4e85b15ec MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
  1. # Natural Language Toolkit: Dependency Grammars
  2. #
  3. # Copyright (C) 2001-2011 NLTK Project
  4. # Author: Jason Narad <jason.narad@gmail.com>
  5. # Steven Bird <sb@csse.unimelb.edu.au> (modifications)
  6. #
  7. # URL: <http://www.nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. #
  10. """
  11. Tools for reading and writing dependency trees.
  12. The input is assumed to be in U{Malt-TAB<http://w3.msi.vxu.se/~nivre/research/MaltXML.html>} format.
  13. Currently only reads the first tree in a file.
  14. """
  15. from nltk.tree import Tree
  16. from pprint import pformat
  17. import re
  18. #################################################################
  19. # DependencyGraph Class
  20. #################################################################
  21. class DependencyGraph(object):
  22. """
  23. A container for the nodes and labelled edges of a dependency structure.
  24. """
  25. def __init__(self, tree_str=None):
  26. """
  27. We place a dummy 'top' node in the first position
  28. in the nodelist, since the root node is often assigned '0'
  29. as its head. This also means that the indexing of the nodelist
  30. corresponds directly to the Malt-TAB format, which starts at 1.
  31. """
  32. top = {'word':None, 'deps':[], 'rel': 'TOP', 'tag': 'TOP', 'address': 0}
  33. self.nodelist = [top]
  34. self.root = None
  35. self.stream = None
  36. if tree_str:
  37. self._parse(tree_str)
  38. def remove_by_address(self, address):
  39. """
  40. Removes the node with the given address. References
  41. to this node in others will still exist.
  42. """
  43. node_index = len(self.nodelist) - 1
  44. while(node_index >= 0):
  45. node = self.nodelist[node_index]
  46. if node['address'] == address:
  47. self.nodelist.pop(node_index)
  48. node_index -= 1
  49. def redirect_arcs(self, originals, redirect):
  50. """
  51. Redirects arcs to any of the nodes in the originals list
  52. to the redirect node address.
  53. """
  54. for node in self.nodelist:
  55. new_deps = []
  56. for dep in node['deps']:
  57. if dep in originals:
  58. new_deps.append(redirect)
  59. else:
  60. new_deps.append(dep)
  61. node['deps'] = new_deps
  62. def add_arc(self, head_address, mod_address):
  63. """
  64. Adds an arc from the node specified by head_address to the
  65. node specified by the mod address.
  66. """
  67. for node in self.nodelist:
  68. if node['address'] == head_address and (mod_address not in node['deps']):
  69. node['deps'].append(mod_address)
  70. def connect_graph(self):
  71. """
  72. Fully connects all non-root nodes. All nodes are set to be dependents
  73. of the root node.
  74. """
  75. for node1 in self.nodelist:
  76. for node2 in self.nodelist:
  77. if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
  78. node1['deps'].append(node2['address'])
  79. # fix error and return
  80. def get_by_address(self, node_address):
  81. """
  82. Returns the node with the given address.
  83. """
  84. for node in self.nodelist:
  85. if node['address'] == node_address:
  86. return node
  87. print 'THROW ERROR: address not found in -get_by_address-'
  88. return -1
  89. def contains_address(self, node_address):
  90. """
  91. Returns true if the graph contains a node with the given node
  92. address, false otherwise.
  93. """
  94. for node in self.nodelist:
  95. if node['address'] == node_address:
  96. return True
  97. return False
  98. def __str__(self):
  99. return pformat(self.nodelist)
  100. def __repr__(self):
  101. return "<DependencyGraph with %d nodes>" % len(self.nodelist)
  102. @staticmethod
  103. def load(file):
  104. """
  105. @param file: a file in Malt-TAB format
  106. """
  107. return DependencyGraph(open(file).read())
  108. @staticmethod
  109. def _normalize(line):
  110. """
  111. Deal with lines in which spaces are used rather than tabs.
  112. """
  113. SPC = re.compile(' +')
  114. return re.sub(SPC, '\t', line).strip()
  115. def left_children(self, node_index):
  116. """
  117. Returns the number of left children under the node specified
  118. by the given address.
  119. """
  120. children = self.nodelist[node_index]['deps']
  121. index = self.nodelist[node_index]['address']
  122. return sum(1 for c in children if c < index)
  123. def right_children(self, node_index):
  124. """
  125. Returns the number of right children under the node specified
  126. by the given address.
  127. """
  128. children = self.nodelist[node_index]['deps']
  129. index = self.nodelist[node_index]['address']
  130. return sum(1 for c in children if c > index)
  131. def add_node(self, node):
  132. if not self.contains_address(node['address']):
  133. self.nodelist.append(node)
  134. def _parse(self, input):
  135. lines = [DependencyGraph._normalize(line) for line in input.split('\n') if line.strip()]
  136. temp = []
  137. for index, line in enumerate(lines):
  138. # print line
  139. try:
  140. cells = line.split('\t')
  141. nrCells = len(cells)
  142. if nrCells == 3:
  143. word, tag, head = cells
  144. rel = ''
  145. elif nrCells == 4:
  146. word, tag, head, rel = cells
  147. elif nrCells == 10:
  148. _, word, _, _, tag, _, head, rel, _, _ = cells
  149. else:
  150. raise ValueError('Number of tab-delimited fields (%d) not supported by CoNLL(10) or Malt-Tab(4) format' % (nrCells))
  151. head = int(head)
  152. self.nodelist.append({'address': index+1, 'word': word, 'tag': tag,
  153. 'head': head, 'rel': rel,
  154. 'deps': [d for (d,h) in temp if h == index+1]})
  155. try:
  156. self.nodelist[head]['deps'].append(index+1)
  157. except IndexError:
  158. temp.append((index+1, head))
  159. except ValueError:
  160. break
  161. root_address = self.nodelist[0]['deps'][0]
  162. self.root = self.nodelist[root_address]
  163. def _word(self, node, filter=True):
  164. w = node['word']
  165. if filter:
  166. if w != ',': return w
  167. return w
  168. def _tree(self, i):
  169. """
  170. Recursive function for turning dependency graphs into
  171. NLTK trees.
  172. @type i: C{int}
  173. @param i: index of a node in C{nodelist}
  174. @return: either a word (if the indexed node
  175. is a leaf) or a L{Tree}.
  176. """
  177. node = self.nodelist[i]
  178. word = node['word']
  179. deps = node['deps']
  180. if len(deps) == 0:
  181. return word
  182. else:
  183. return Tree(word, [self._tree(j) for j in deps])
  184. def tree(self):
  185. """
  186. Starting with the C{root} node, build a dependency tree using the NLTK
  187. L{Tree} constructor. Dependency labels are omitted.
  188. """
  189. node = self.root
  190. word = node['word']
  191. deps = node['deps']
  192. return Tree(word, [self._tree(i) for i in deps])
  193. def _hd(self, i):
  194. try:
  195. return self.nodelist[i]['head']
  196. except IndexError:
  197. return None
  198. def _rel(self, i):
  199. try:
  200. return self.nodelist[i]['rel']
  201. except IndexError:
  202. return None
  203. # what's the return type? Boolean or list?
  204. def contains_cycle(self):
  205. distances = {}
  206. for node in self.nodelist:
  207. for dep in node['deps']:
  208. key = tuple([node['address'], dep]) #'%d -> %d' % (node['address'], dep)
  209. distances[key] = 1
  210. window = 0
  211. for n in range(len(self.nodelist)):
  212. new_entries = {}
  213. for pair1 in distances:
  214. for pair2 in distances:
  215. if pair1[1] == pair2[0]:
  216. key = tuple([pair1[0], pair2[1]])
  217. new_entries[key] = distances[pair1] + distances[pair2]
  218. for pair in new_entries:
  219. distances[pair] = new_entries[pair]
  220. if pair[0] == pair[1]:
  221. print pair[0]
  222. path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0]) #self.nodelist[pair[0]], pair[0])
  223. return path
  224. return False # return []?
  225. def get_cycle_path(self, curr_node, goal_node_index):
  226. for dep in curr_node['deps']:
  227. if dep == goal_node_index:
  228. return [curr_node['address']]
  229. for dep in curr_node['deps']:
  230. path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)#self.nodelist[dep], goal_node_index)
  231. if len(path) > 0:
  232. path.insert(0, curr_node['address'])
  233. return path
  234. return []
  235. def to_conll(self, style):
  236. """
  237. The dependency graph in CoNLL format.
  238. @param style: the style to use for the format (3, 4, 10 columns)
  239. @type style: C{int}
  240. @rtype: C{str}
  241. """
  242. lines = []
  243. for i, node in enumerate(self.nodelist[1:]):
  244. word, tag, head, rel = node['word'], node['tag'], node['head'], node['rel']
  245. if style == 3:
  246. lines.append('%s\t%s\t%s\n' % (word, tag, head))
  247. elif style == 4:
  248. lines.append('%s\t%s\t%s\t%s\n' % (word, tag, head, rel))
  249. elif style == 10:
  250. lines.append('%s\t%s\t_\t%s\t%s\t_\t%s\t%s\t_\t_\n' % (i+1, word, tag, tag, head, rel))
  251. else:
  252. raise ValueError('Number of tab-delimited fields (%d) not supported by CoNLL(10) or Malt-Tab(4) format' % (style))
  253. return ''.join(lines)
  254. def nx_graph(self):
  255. """
  256. Convert the data in a C{nodelist} into a networkx
  257. labeled directed graph.
  258. @rtype: C{XDigraph}
  259. """
  260. nx_nodelist = range(1, len(self.nodelist))
  261. nx_edgelist = [(n, self._hd(n), self._rel(n))
  262. for n in nx_nodelist if self._hd(n)]
  263. self.nx_labels = {}
  264. for n in nx_nodelist:
  265. self.nx_labels[n] = self.nodelist[n]['word']
  266. g = NX.XDiGraph()
  267. g.add_nodes_from(nx_nodelist)
  268. g.add_edges_from(nx_edgelist)
  269. return g
  270. def demo():
  271. malt_demo()
  272. conll_demo()
  273. conll_file_demo()
  274. cycle_finding_demo()
  275. def malt_demo(nx=False):
  276. """
  277. A demonstration of the result of reading a dependency
  278. version of the first sentence of the Penn Treebank.
  279. """
  280. dg = DependencyGraph("""Pierre NNP 2 NMOD
  281. Vinken NNP 8 SUB
  282. , , 2 P
  283. 61 CD 5 NMOD
  284. years NNS 6 AMOD
  285. old JJ 2 NMOD
  286. , , 2 P
  287. will MD 0 ROOT
  288. join VB 8 VC
  289. the DT 11 NMOD
  290. board NN 9 OBJ
  291. as IN 9 VMOD
  292. a DT 15 NMOD
  293. nonexecutive JJ 15 NMOD
  294. director NN 12 PMOD
  295. Nov. NNP 9 VMOD
  296. 29 CD 16 NMOD
  297. . . 9 VMOD
  298. """)
  299. tree = dg.tree()
  300. print tree.pprint()
  301. if nx:
  302. #currently doesn't work
  303. try:
  304. import networkx as NX
  305. import pylab as P
  306. except ImportError:
  307. raise
  308. g = dg.nx_graph()
  309. g.info()
  310. pos = NX.spring_layout(g, dim=1)
  311. NX.draw_networkx_nodes(g, pos, node_size=50)
  312. #NX.draw_networkx_edges(g, pos, edge_color='k', width=8)
  313. NX.draw_networkx_labels(g, pos, dg.nx_labels)
  314. P.xticks([])
  315. P.yticks([])
  316. P.savefig('tree.png')
  317. P.show()
  318. def conll_demo():
  319. """
  320. A demonstration of how to read a string representation of
  321. a CoNLL format dependency tree.
  322. """
  323. dg = DependencyGraph(conll_data1)
  324. tree = dg.tree()
  325. print tree.pprint()
  326. print dg
  327. print dg.to_conll(4)
  328. def conll_file_demo():
  329. print 'Mass conll_read demo...'
  330. graphs = [DependencyGraph(entry)
  331. for entry in conll_data2.split('\n\n') if entry]
  332. for graph in graphs:
  333. tree = graph.tree()
  334. print '\n' + tree.pprint()
  335. def cycle_finding_demo():
  336. dg = DependencyGraph(treebank_data)
  337. print dg.contains_cycle()
  338. cyclic_dg = DependencyGraph()
  339. top = {'word':None, 'deps':[1], 'rel': 'TOP', 'address': 0}
  340. child1 = {'word':None, 'deps':[2], 'rel': 'NTOP', 'address': 1}
  341. child2 = {'word':None, 'deps':[4], 'rel': 'NTOP', 'address': 2}
  342. child3 = {'word':None, 'deps':[1], 'rel': 'NTOP', 'address': 3}
  343. child4 = {'word':None, 'deps':[3], 'rel': 'NTOP', 'address': 4}
  344. cyclic_dg.nodelist = [top, child1, child2, child3, child4]
  345. cyclic_dg.root = top
  346. print cyclic_dg.contains_cycle()
# Four-column sample (word, tag, head, relation), one token per line.
# Parsed by cycle_finding_demo().
treebank_data = """Pierre NNP 2 NMOD
Vinken NNP 8 SUB
, , 2 P
61 CD 5 NMOD
years NNS 6 AMOD
old JJ 2 NMOD
, , 2 P
will MD 0 ROOT
join VB 8 VC
the DT 11 NMOD
board NN 9 OBJ
as IN 9 VMOD
a DT 15 NMOD
nonexecutive JJ 15 NMOD
director NN 12 PMOD
Nov. NNP 9 VMOD
29 CD 16 NMOD
. . 9 VMOD
"""
# A single sentence in ten-column CoNLL format, one token per line.
# Parsed by conll_demo().
conll_data1 = """
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 met met Prep Prep voor 8 mod _ _
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
7 gaan ga V V hulp|inf 6 vc _ _
8 winkelen winkel V V intrans|inf 11 cnj _ _
9 , , Punc Punc komma 8 punct _ _
10 zwemmen zwem V V intrans|inf 11 cnj _ _
11 of of Conj Conj neven 7 vc _ _
12 terrassen terras N N soort|mv|neut 11 cnj _ _
13 . . Punc Punc punt 12 punct _ _
"""
  381. conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _
  382. 2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _
  383. 3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _
  384. 4 wild wild Adj Adj attr|stell|onverv 5 mod _ _
  385. 5 zwaaien zwaai N N soort|mv|neut 2 vc _ _
  386. 6 . . Punc Punc punt 5 punct _ _
  387. 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
  388. 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
  389. 3 met met Prep Prep voor 8 mod _ _
  390. 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
  391. 5 moeder moeder N N soort|ev|neut 3 obj1 _ _
  392. 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
  393. 7 gaan ga V V hulp|inf 6 vc _ _
  394. 8 winkelen winkel V V intrans|inf 11 cnj _ _
  395. 9 , , Punc Punc komma 8 punct _ _
  396. 10 zwemmen zwem V V intrans|inf 11 cnj _ _
  397. 11 of of Conj Conj neven 7 vc _ _
  398. 12 terrassen terras N N soort|mv|neut 11 cnj _ _
  399. 13 . . Punc Punc punt 12 punct _ _
  400. 1 Dat dat Pron Pron aanw|neut|attr 2 det _ _
  401. 2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _
  402. 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
  403. 4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _
  404. 5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _
  405. 6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _
  406. 7 . . Punc Punc punt 6 punct _ _
  407. 1 Het het Pron Pron onbep|neut|zelfst 2 su _ _
  408. 2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _
  409. 3 bij bij Prep Prep voor 2 ld _ _
  410. 4 de de Art Art bep|zijdofmv|neut 6 det _ _
  411. 5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _
  412. 6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _
  413. 7 die die Pron Pron betr|neut|zelfst 6 mod _ _
  414. 8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _
  415. 9 ginds ginds Adv Adv gew|aanw 12 mod _ _
  416. 10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _
  417. 11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _
  418. 12 gelaten laat V V trans|verldw|onverv 11 vc _ _
  419. 13 . . Punc Punc punt 12 punct _ _
  420. 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
  421. 2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _
  422. 3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _
  423. 4 naast naast Prep Prep voor 11 mod _ _
  424. 5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _
  425. 6 op op Prep Prep voor 11 ld _ _
  426. 7 de de Art Art bep|zijdofmv|neut 8 det _ _
  427. 8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _
  428. 9 kunnen kan V V hulp|inf 2 vc _ _
  429. 10 gaan ga V V hulp|inf 9 vc _ _
  430. 11 liggen lig V V intrans|inf 10 vc _ _
  431. 12 . . Punc Punc punt 11 punct _ _
  432. 1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _
  433. 2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _
  434. 3 mams mams N N soort|ev|neut 4 det _ _
  435. 4 rug rug N N soort|ev|neut 5 obj1 _ _
  436. 5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _
  437. 6 hebben heb V V hulp|inf 2 vc _ _
  438. 7 en en Conj Conj neven 0 ROOT _ _
  439. 8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _
  440. 9 de de Art Art bep|zijdofmv|neut 10 det _ _
  441. 10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _
  442. 11 . . Punc Punc punt 10 punct _ _
  443. 1 Of of Conj Conj onder|metfin 0 ROOT _ _
  444. 2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _
  445. 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
  446. 4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _
  447. 5 met met Prep Prep voor 10 mod _ _
  448. 6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _
  449. 7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _
  450. 8 rond rond Adv Adv deelv 10 svp _ _
  451. 9 kunnen kan V V hulp|inf 3 vc _ _
  452. 10 slenteren slenter V V intrans|inf 9 vc _ _
  453. 11 in in Prep Prep voor 10 mod _ _
  454. 12 de de Art Art bep|zijdofmv|neut 13 det _ _
  455. 13 buurt buurt N N soort|ev|neut 11 obj1 _ _
  456. 14 van van Prep Prep voor 13 mod _ _
  457. 15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _
  458. 16 . . Punc Punc punt 15 punct _ _
  459. """
# Run all demonstrations when this module is executed as a script.
if __name__ == '__main__':
    demo()