
/nltk/sem/relextract.py

https://github.com/BrucePHill/nltk

# Natural Language Toolkit: Relation Extraction
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Code for extracting relational triples from the ieer and conll2002 corpora.

Relations are stored internally as dictionaries ('reldicts').

The two serialization outputs are "rtuple" and "clause".

- An rtuple is a tuple of the form ``(subj, filler, obj)``,
  where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
  occurring between ``subj`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
  circumvent locale variations in rendering utf-8 encoded strings.

- A clause is an atom of the form ``relsym(subjsym, objsym)``,
  where the relation, subject and object have been canonicalized to single strings.
"""
from __future__ import print_function

# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?

from collections import defaultdict
import re

from nltk.compat import htmlentitydefs

# Dictionary that associates corpora with NE classes
NE_CLASSES = {
    'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
             'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'],
    'conll2002': ['LOC', 'PER', 'ORG'],
    'ace': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
            'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE', 'FACILITY', 'GPE'],
    }

# Allow abbreviated class labels
short2long = dict(LOC='LOCATION', ORG='ORGANIZATION', PER='PERSON')
long2short = dict(LOCATION='LOC', ORGANIZATION='ORG', PERSON='PER')

def _expand(type):
    """
    Expand an NE class name.
    :type type: str
    :rtype: str
    """
    try:
        return short2long[type]
    except KeyError:
        return type


def class_abbrev(type):
    """
    Abbreviate an NE class name.
    :type type: str
    :rtype: str
    """
    try:
        return long2short[type]
    except KeyError:
        return type
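
# Usage sketch: both helpers fall back to returning the label unchanged when
# it has no entry in short2long/long2short, e.g.
#
#     >>> _expand('ORG')
#     'ORGANIZATION'
#     >>> class_abbrev('PERSON')
#     'PER'
#     >>> class_abbrev('GPE')    # no short form defined, returned as is
#     'GPE'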

def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
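
# Usage sketch: plain strings are joined directly, while (word, tag) tuples
# hit the TypeError branch and are rendered as word/tag strings, or as bare
# words when untag=True, e.g.
#
#     >>> _join(['New', 'York'])
#     'New York'
#     >>> _join([('New', 'NNP'), ('York', 'NNP')])
#     'New/NNP York/NNP'
#     >>> _join([('New', 'NNP'), ('York', 'NNP')], untag=True)
#     'New York'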

def descape_entity(m, defs=htmlentitydefs.entitydefs):
    """
    Translate one entity to its ISO Latin value.
    Inspired by example from effbot.org
    """
    # s = 'mcglashan_&amp;_sarrail'
    # l = ['mcglashan', '&amp;', 'sarrail']
    # pattern = re.compile("&(\w+?);")
    # new = list2sym(l)
    # s = pattern.sub(descape_entity, s)
    # print s, new
    try:
        return defs[m.group(1)]
    except KeyError:
        return m.group(0)  # use as is

def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.
    :type lst: list
    :return: a Unicode string without whitespace
    :rtype: unicode
    """
    sym = _join(lst, '_', untag=True)
    sym = sym.lower()
    ENT = re.compile("&(\w+?);")
    sym = ENT.sub(descape_entity, sym)
    sym = sym.replace('.', '')
    return sym
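
# Usage sketch, reusing the example from descape_entity above: tokens are
# joined with '_', lower-cased, HTML entities are descaped and '.' is removed:
#
#     >>> list2sym(['McGlashan', '&amp;', 'Sarrail'])
#     'mcglashan_&_sarrail'
#     >>> list2sym(['U.S.', 'Supreme', 'Court'])
#     'us_supreme_court'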

def mk_pairs(tree):
    """
    Group a chunk structure into a list of pairs of the form (list(str), ``Tree``)

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """
    from nltk.tree import Tree
    pairs = []
    pair = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            pair[0].append(dtr)
        else:
            # dtr is a Tree
            pair[1] = dtr
            pairs.append(pair)
            pair = [[], None]
    return pairs
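
# Illustrative sketch over a hypothetical chunk tree of (word, tag) tokens
# (this module uses the older Tree API with the .node attribute): each NE
# subtree is paired with the tokens that precede it, and tokens after the
# last NE are discarded, e.g.
#
#     >>> from nltk.tree import Tree
#     >>> sent = Tree('S', [('Talks', 'NNS'), ('in', 'IN'),
#     ...                   Tree('LOC', [('Paris', 'NNP')]),
#     ...                   ('with', 'IN'),
#     ...                   Tree('ORG', [('UNESCO', 'NNP')])])
#     >>> for words, ne in mk_pairs(sent):
#     ...     print(words, ne.node, ne.leaves())
#     [('Talks', 'NNS'), ('in', 'IN')] LOC [('Paris', 'NNP')]
#     [('with', 'IN')] ORG [('UNESCO', 'NNP')]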

def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by ``mk_pairs`` into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length <= window are captured (within
    a given input sentence).

    :param pairs: a list of (list(str), ``Tree``) pairs, as generated by ``mk_pairs``
    :param window: a threshold for the number of items to include in the left and right context
    :type window: int
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym',
        'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    result = []
    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = _join(pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].node
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
        reldict['objclass'] = pairs[1][1].node
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            print("(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass']))
        result.append(reldict)
        pairs = pairs[1:]
    return result
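
# Illustrative sketch: the window slides one NE at a time, so a sentence needs
# at least three NEs before any reldict is produced; each reldict describes two
# adjacent NEs, the filler between them, and up to `window` context tokens on
# either side.  With `pairs` built by mk_pairs from a hypothetical chunk tree
# containing three NEs:
#
#     >>> rd = mk_reldicts(pairs)[0]
#     >>> sorted(rd.keys())
#     ['filler', 'lcon', 'objclass', 'objsym', 'objtext',
#      'rcon', 'subjclass', 'subjsym', 'subjtext']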

def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
    """
    Filter the output of ``mk_reldicts`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ieer', 'conll2002' and 'ace'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``mk_reldicts``
    :rtype: list(defaultdict)
    """
    if subjclass and subjclass not in NE_CLASSES[corpus]:
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError("your value for the subject type has not been recognized: %s" % subjclass)
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError("your value for the object type has not been recognized: %s" % objclass)

    if corpus == 'ace' or corpus == 'conll2002':
        pairs = mk_pairs(doc)
    elif corpus == 'ieer':
        pairs = mk_pairs(doc.text) + mk_pairs(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = mk_reldicts(pairs)

    relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objclass)

    return list(filter(relfilter, reldicts))
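
# Minimal usage sketch, assuming the conll2002 corpus has been installed (e.g.
# via nltk.download('conll2002')); the regex here is only an illustrative
# filler filter:
#
#     >>> import re
#     >>> from nltk.corpus import conll2002
#     >>> DE = re.compile(r'.*\bde\b')
#     >>> for doc in conll2002.chunked_sents('esp.train')[:50]:
#     ...     for rel in extract_rels('ORG', 'LOC', doc,
#     ...                             corpus='conll2002', pattern=DE, window=10):
#     ...         print(show_raw_rtuple(rel))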

def show_raw_rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple.
    :param reldict: a relation dictionary
    :type reldict: defaultdict
    """
    items = [class_abbrev(reldict['subjclass']), reldict['subjtext'], reldict['filler'],
             class_abbrev(reldict['objclass']), reldict['objtext']]
    format = '[%s: %r] %r [%s: %r]'
    if lcon:
        items = [reldict['lcon']] + items
        format = '...%r)' + format
    if rcon:
        items.append(reldict['rcon'])
        format = format + '(%r...'
    printargs = tuple(items)
    return format % printargs
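
# Format sketch with hypothetical values: the returned rtuple string looks like
#
#     [ORG: 'Example Corp.'] 'is based in' [LOC: 'Springfield']
#
# and with lcon/rcon set, the left and right context strings are wrapped
# around it via the ...%r) prefix and the (%r... suffix.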

def show_clause(reldict, relsym):
    """
    Print the relation in clausal form.
    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    """
    items = (relsym, reldict['subjsym'], reldict['objsym'])
    return "%s(%r, %r)" % items
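
# Format sketch with hypothetical values: given a reldict whose 'subjsym' and
# 'objsym' are 'example_corp' and 'springfield',
#
#     >>> show_clause(reldict, relsym='IN')
#     "IN('example_corp', 'springfield')"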

#######################################################
# Demos of relation extraction with regular expressions
#######################################################

############################################
# Example of in(ORG, LOC)
############################################

def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.
    """
    from nltk.corpus import ieer
    if sql:
        try:
            import sqlite3
            connection = sqlite3.connect(":memory:")
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute("""create table Locations
                        (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings
            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(show_clause(rel, relsym='IN'))
                if sql:
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute("""insert into Locations
                                    values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass

    if sql:
        try:
            cur.execute("""select OrgName from Locations
                        where LocationName = 'Atlanta'""")
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass

############################################
# Example of has_role(PER, ORG)
############################################

def roles_demo(trace=0):
    from nltk.corpus import ieer
    roles = """
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))

##############################################
### Show what's in the IEER Headlines
##############################################

def ieer_headlines():
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # pair each headline tree with the docno of the document it came from
    trees = [(doc.docno, doc.headline)
             for file in ieer.fileids()
             for doc in ieer.parsed_docs(file)]
    for docno, tree in trees[:20]:
        print()
        print("%s:\n%s" % (docno, tree))

#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################

def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.
    """
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become')
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    for doc in conll2002.chunked_sents('ned.train'):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
            # pass through the trace-controlled context flags
            print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))

#############################################
## Spanish CONLL2002: de(ORG, LOC)
#############################################

def conllesp():
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)]
    for r in rels[:10]:
        print(show_clause(r, relsym='DE'))
    print()

def ne_chunked():
    import nltk
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    for sent in nltk.corpus.treebank.tagged_sents()[:100]:
        sent = nltk.ne_chunk(sent)
        print(extract_rels('ORG', 'LOC', sent, corpus='ace', pattern=IN))

if __name__ == '__main__':
    import nltk
    from nltk.sem import relextract
    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    ieer_headlines()