PageRenderTime 64ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/nltk/nltk/chunk/regexp.py

http://nltk.googlecode.com/
Python | 1365 lines | 1317 code | 10 blank | 38 comment | 15 complexity | 7cfc757bc60ea83d7ea6e3d7d643aa66 MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0

Large files are truncated, but you can click here to view the full file

  1. # Natural Language Toolkit: Regular Expression Chunkers
  2. #
  3. # Copyright (C) 2001-2011 NLTK Project
  4. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  5. # Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
  6. # URL: <http://www.nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. import re
  9. import types
  10. from nltk.tree import Tree
  11. from nltk.chunk.api import *
  12. from nltk.chunk.util import *
  13. ##//////////////////////////////////////////////////////
  14. ## ChunkString
  15. ##//////////////////////////////////////////////////////
  16. class ChunkString(object):
  17. """
  18. A string-based encoding of a particular chunking of a text.
  19. Internally, the C{ChunkString} class uses a single string to
  20. encode the chunking of the input text. This string contains a
  21. sequence of angle-bracket delimited tags, with chunking indicated
  22. by braces. An example of this encoding is::
  23. {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
  24. C{ChunkString} are created from tagged texts (i.e., C{list}s of
  25. C{tokens} whose type is C{TaggedType}). Initially, nothing is
  26. chunked.
  27. The chunking of a C{ChunkString} can be modified with the C{xform}
  28. method, which uses a regular expression to transform the string
  29. representation. These transformations should only add and remove
  30. braces; they should I{not} modify the sequence of angle-bracket
  31. delimited tags.
  32. @type _str: C{string}
  33. @ivar _str: The internal string representation of the text's
  34. encoding. This string representation contains a sequence of
  35. angle-bracket delimited tags, with chunking indicated by
  36. braces. An example of this encoding is::
  37. {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
  38. @type _pieces: C{list} of pieces (tagged tokens and chunks)
  39. @ivar _pieces: The tagged tokens and chunks encoded by this C{ChunkString}.
  40. @ivar _debug: The debug level. See the constructor docs.
  41. @cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
  42. will only match positions that are in chunks.
  43. @cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
  44. will only match positions that are in chinks.
  45. """
  46. CHUNK_TAG_CHAR = r'[^\{\}<>]'
  47. CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR
  48. IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
  49. IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'
  50. # These are used by _verify
  51. _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
  52. _CHINK = r'(%s+?)+?' % CHUNK_TAG
  53. _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
  54. _BRACKETS = re.compile('[^\{\}]+')
  55. _BALANCED_BRACKETS = re.compile(r'(\{\})*$')
  56. def __init__(self, chunk_struct, debug_level=1):
  57. """
  58. Construct a new C{ChunkString} that encodes the chunking of
  59. the text C{tagged_tokens}.
  60. @type chunk_struct: C{Tree}
  61. @param chunk_struct: The chunk structure to be further chunked.
  62. @type debug_level: int
  63. @param debug_level: The level of debugging which should be
  64. applied to transformations on the C{ChunkString}. The
  65. valid levels are:
  66. - 0: no checks
  67. - 1: full check on to_chunkstruct
  68. - 2: full check on to_chunkstruct and cursory check after
  69. each transformation.
  70. - 3: full check on to_chunkstruct and full check after
  71. each transformation.
  72. We recommend you use at least level 1. You should
  73. probably use level 3 if you use any non-standard
  74. subclasses of C{RegexpChunkRule}.
  75. """
  76. self._top_node = chunk_struct.node
  77. self._pieces = chunk_struct[:]
  78. tags = [self._tag(tok) for tok in self._pieces]
  79. self._str = '<' + '><'.join(tags) + '>'
  80. self._debug = debug_level
  81. def _tag(self, tok):
  82. if type(tok) == types.TupleType:
  83. return tok[1]
  84. elif isinstance(tok, Tree):
  85. return tok.node
  86. else:
  87. raise ValueError('chunk structures must contain tagged '
  88. 'tokens or trees')
  89. def _verify(self, s, verify_tags):
  90. """
  91. Check to make sure that C{s} still corresponds to some chunked
  92. version of C{_pieces}.
  93. @type verify_tags: C{boolean}
  94. @param verify_tags: Whether the individual tags should be
  95. checked. If this is false, C{_verify} will check to make
  96. sure that C{_str} encodes a chunked version of I{some}
  97. list of tokens. If this is true, then C{_verify} will
  98. check to make sure that the tags in C{_str} match those in
  99. C{_pieces}.
  100. @raise ValueError: if this C{ChunkString}'s internal string
  101. representation is invalid or not consistent with _pieces.
  102. """
  103. # Check overall form
  104. if not ChunkString._VALID.match(s):
  105. raise ValueError('Transformation generated invalid '
  106. 'chunkstring:\n %s' % s)
  107. # Check that parens are balanced. If the string is long, we
  108. # have to do this in pieces, to avoid a maximum recursion
  109. # depth limit for regular expressions.
  110. brackets = ChunkString._BRACKETS.sub('', s)
  111. for i in range(1+len(brackets)/5000):
  112. substr = brackets[i*5000:i*5000+5000]
  113. if not ChunkString._BALANCED_BRACKETS.match(substr):
  114. raise ValueError('Transformation generated invalid '
  115. 'chunkstring:\n %s' % s)
  116. if verify_tags<=0: return
  117. tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
  118. tags2 = [self._tag(piece) for piece in self._pieces]
  119. if tags1 != tags2:
  120. raise ValueError('Transformation generated invalid '
  121. 'chunkstring: tag changed')
  122. def to_chunkstruct(self, chunk_node='CHUNK'):
  123. """
  124. @return: the chunk structure encoded by this C{ChunkString}.
  125. @rtype: C{Tree}
  126. @raise ValueError: If a transformation has generated an
  127. invalid chunkstring.
  128. """
  129. if self._debug > 0: self._verify(self._str, 1)
  130. # Use this alternating list to create the chunkstruct.
  131. pieces = []
  132. index = 0
  133. piece_in_chunk = 0
  134. for piece in re.split('[{}]', self._str):
  135. # Find the list of tokens contained in this piece.
  136. length = piece.count('<')
  137. subsequence = self._pieces[index:index+length]
  138. # Add this list of tokens to our pieces.
  139. if piece_in_chunk:
  140. pieces.append(Tree(chunk_node, subsequence))
  141. else:
  142. pieces += subsequence
  143. # Update index, piece_in_chunk
  144. index += length
  145. piece_in_chunk = not piece_in_chunk
  146. return Tree(self._top_node, pieces)
  147. def xform(self, regexp, repl):
  148. """
  149. Apply the given transformation to this C{ChunkString}'s string
  150. encoding. In particular, find all occurrences that match
  151. C{regexp}, and replace them using C{repl} (as done by
  152. C{re.sub}).
  153. This transformation should only add and remove braces; it
  154. should I{not} modify the sequence of angle-bracket delimited
  155. tags. Furthermore, this transformation may not result in
  156. improper bracketing. Note, in particular, that bracketing may
  157. not be nested.
  158. @type regexp: C{string} or C{regexp}
  159. @param regexp: A regular expression matching the substring
  160. that should be replaced. This will typically include a
  161. named group, which can be used by C{repl}.
  162. @type repl: C{string}
  163. @param repl: An expression specifying what should replace the
  164. matched substring. Typically, this will include a named
  165. replacement group, specified by C{regexp}.
  166. @rtype: C{None}
  167. @raise ValueError: If this transformation generated an
  168. invalid chunkstring.
  169. """
  170. # Do the actual substitution
  171. s = re.sub(regexp, repl, self._str)
  172. # The substitution might have generated "empty chunks"
  173. # (substrings of the form "{}"). Remove them, so they don't
  174. # interfere with other transformations.
  175. s = re.sub('\{\}', '', s)
  176. # Make sure that the transformation was legal.
  177. if self._debug > 1: self._verify(s, self._debug-2)
  178. # Commit the transformation.
  179. self._str = s
  180. def __repr__(self):
  181. """
  182. @rtype: C{string}
  183. @return: A string representation of this C{ChunkString}. This
  184. string representation has the form::
  185. <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>
  186. """
  187. return '<ChunkString: %s>' % `self._str`
  188. def __str__(self):
  189. """
  190. @rtype: C{string}
  191. @return: A formatted representation of this C{ChunkString}'s
  192. string encoding. This representation will include extra
  193. spaces to ensure that tags will line up with the
  194. representation of other C{ChunkStrings} for the same text,
  195. regardless of the chunking.
  196. """
  197. # Add spaces to make everything line up.
  198. str = re.sub(r'>(?!\})', r'> ', self._str)
  199. str = re.sub(r'([^\{])<', r'\1 <', str)
  200. if str[0] == '<': str = ' ' + str
  201. return str
  202. ##//////////////////////////////////////////////////////
  203. ## Chunking Rules
  204. ##//////////////////////////////////////////////////////
  205. class RegexpChunkRule(object):
  206. """
  207. A rule specifying how to modify the chunking in a C{ChunkString},
  208. using a transformational regular expression. The
  209. C{RegexpChunkRule} class itself can be used to implement any
  210. transformational rule based on regular expressions. There are
  211. also a number of subclasses, which can be used to implement
  212. simpler types of rules, based on matching regular expressions.
  213. Each C{RegexpChunkRule} has a regular expression and a
  214. replacement expression. When a C{RegexpChunkRule} is X{applied}
  215. to a C{ChunkString}, it searches the C{ChunkString} for any
  216. substring that matches the regular expression, and replaces it
  217. using the replacement expression. This search/replace operation
  218. has the same semantics as C{re.sub}.
  219. Each C{RegexpChunkRule} also has a description string, which
  220. gives a short (typically less than 75 characters) description of
  221. the purpose of the rule.
  222. This transformation defined by this C{RegexpChunkRule} should
  223. only add and remove braces; it should I{not} modify the sequence
  224. of angle-bracket delimited tags. Furthermore, this transformation
  225. may not result in nested or mismatched bracketing.
  226. """
  227. def __init__(self, regexp, repl, descr):
  228. """
  229. Construct a new RegexpChunkRule.
  230. @type regexp: C{regexp} or C{string}
  231. @param regexp: This C{RegexpChunkRule}'s regular expression.
  232. When this rule is applied to a C{ChunkString}, any
  233. substring that matches C{regexp} will be replaced using
  234. the replacement string C{repl}. Note that this must be a
  235. normal regular expression, not a tag pattern.
  236. @type repl: C{string}
  237. @param repl: This C{RegexpChunkRule}'s replacement
  238. expression. When this rule is applied to a
  239. C{ChunkString}, any substring that matches C{regexp} will
  240. be replaced using C{repl}.
  241. @type descr: C{string}
  242. @param descr: A short description of the purpose and/or effect
  243. of this rule.
  244. """
  245. if isinstance(regexp, basestring):
  246. regexp = re.compile(regexp)
  247. self._repl = repl
  248. self._descr = descr
  249. self._regexp = regexp
  250. def apply(self, chunkstr):
  251. # Keep docstring generic so we can inherit it.
  252. """
  253. Apply this rule to the given C{ChunkString}. See the
  254. class reference documentation for a description of what it
  255. means to apply a rule.
  256. @type chunkstr: C{ChunkString}
  257. @param chunkstr: The chunkstring to which this rule is
  258. applied.
  259. @rtype: C{None}
  260. @raise ValueError: If this transformation generated an
  261. invalid chunkstring.
  262. """
  263. chunkstr.xform(self._regexp, self._repl)
  264. def descr(self):
  265. """
  266. @rtype: C{string}
  267. @return: a short description of the purpose and/or effect of
  268. this rule.
  269. """
  270. return self._descr
  271. def __repr__(self):
  272. """
  273. @rtype: C{string}
  274. @return: A string representation of this rule. This
  275. string representation has the form::
  276. <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>
  277. Note that this representation does not include the
  278. description string; that string can be accessed
  279. separately with the C{descr} method.
  280. """
  281. return ('<RegexpChunkRule: '+`self._regexp.pattern`+
  282. '->'+`self._repl`+'>')
  283. @staticmethod
  284. def parse(s):
  285. """
  286. Create a RegexpChunkRule from a string description.
  287. Currently, the following formats are supported::
  288. {regexp} # chunk rule
  289. }regexp{ # chink rule
  290. regexp}{regexp # split rule
  291. regexp{}regexp # merge rule
  292. Where C{regexp} is a regular expression for the rule. Any
  293. text following the comment marker (C{#}) will be used as
  294. the rule's description:
  295. >>> RegexpChunkRule.parse('{<DT>?<NN.*>+}')
  296. <ChunkRule: '<DT>?<NN.*>+'>
  297. """
  298. # Split off the comment (but don't split on '\#')
  299. m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
  300. rule = m.group('rule').strip()
  301. comment = (m.group('comment') or '')[1:].strip()
  302. # Pattern bodies: chunk, chink, split, merge
  303. try:
  304. if not rule:
  305. raise ValueError('Empty chunk pattern')
  306. if rule[0] == '{' and rule[-1] == '}':
  307. return ChunkRule(rule[1:-1], comment)
  308. elif rule[0] == '}' and rule[-1] == '{':
  309. return ChinkRule(rule[1:-1], comment)
  310. elif '}{' in rule:
  311. left, right = rule.split('}{')
  312. return SplitRule(left, right, comment)
  313. elif '{}' in rule:
  314. left, right = rule.split('{}')
  315. return MergeRule(left, right, comment)
  316. elif re.match('[^{}]*{[^{}]*}[^{}]*', rule):
  317. left, chunk, right = re.split('[{}]', rule)
  318. return ChunkRuleWithContext(left, chunk, right, comment)
  319. else:
  320. raise ValueError('Illegal chunk pattern: %s' % rule)
  321. except (ValueError, re.error):
  322. raise ValueError('Illegal chunk pattern: %s' % rule)
  323. class ChunkRule(RegexpChunkRule):
  324. """
  325. A rule specifying how to add chunks to a C{ChunkString}, using a
  326. matching tag pattern. When applied to a C{ChunkString}, it will
  327. find any substring that matches this tag pattern and that is not
  328. already part of a chunk, and create a new chunk containing that
  329. substring.
  330. """
  331. def __init__(self, tag_pattern, descr):
  332. """
  333. Construct a new C{ChunkRule}.
  334. @type tag_pattern: C{string}
  335. @param tag_pattern: This rule's tag pattern. When
  336. applied to a C{ChunkString}, this rule will
  337. chunk any substring that matches this tag pattern and that
  338. is not already part of a chunk.
  339. @type descr: C{string}
  340. @param descr: A short description of the purpose and/or effect
  341. of this rule.
  342. """
  343. self._pattern = tag_pattern
  344. regexp = re.compile('(?P<chunk>%s)%s' %
  345. (tag_pattern2re_pattern(tag_pattern),
  346. ChunkString.IN_CHINK_PATTERN))
  347. RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)
  348. def __repr__(self):
  349. """
  350. @rtype: C{string}
  351. @return: A string representation of this rule. This
  352. string representation has the form::
  353. <ChunkRule: '<IN|VB.*>'>
  354. Note that this representation does not include the
  355. description string; that string can be accessed
  356. separately with the C{descr} method.
  357. """
  358. return '<ChunkRule: '+`self._pattern`+'>'
  359. class ChinkRule(RegexpChunkRule):
  360. """
  361. A rule specifying how to remove chinks to a C{ChunkString},
  362. using a matching tag pattern. When applied to a
  363. C{ChunkString}, it will find any substring that matches this
  364. tag pattern and that is contained in a chunk, and remove it
  365. from that chunk, thus creating two new chunks.
  366. """
  367. def __init__(self, tag_pattern, descr):
  368. """
  369. Construct a new C{ChinkRule}.
  370. @type tag_pattern: C{string}
  371. @param tag_pattern: This rule's tag pattern. When
  372. applied to a C{ChunkString}, this rule will
  373. find any substring that matches this tag pattern and that
  374. is contained in a chunk, and remove it from that chunk,
  375. thus creating two new chunks.
  376. @type descr: C{string}
  377. @param descr: A short description of the purpose and/or effect
  378. of this rule.
  379. """
  380. self._pattern = tag_pattern
  381. regexp = re.compile('(?P<chink>%s)%s' %
  382. (tag_pattern2re_pattern(tag_pattern),
  383. ChunkString.IN_CHUNK_PATTERN))
  384. RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)
  385. def __repr__(self):
  386. """
  387. @rtype: C{string}
  388. @return: A string representation of this rule. This
  389. string representation has the form::
  390. <ChinkRule: '<IN|VB.*>'>
  391. Note that this representation does not include the
  392. description string; that string can be accessed
  393. separately with the C{descr} method.
  394. """
  395. return '<ChinkRule: '+`self._pattern`+'>'
  396. class UnChunkRule(RegexpChunkRule):
  397. """
  398. A rule specifying how to remove chunks to a C{ChunkString},
  399. using a matching tag pattern. When applied to a
  400. C{ChunkString}, it will find any complete chunk that matches this
  401. tag pattern, and un-chunk it.
  402. """
  403. def __init__(self, tag_pattern, descr):
  404. """
  405. Construct a new C{UnChunkRule}.
  406. @type tag_pattern: C{string}
  407. @param tag_pattern: This rule's tag pattern. When
  408. applied to a C{ChunkString}, this rule will
  409. find any complete chunk that matches this tag pattern,
  410. and un-chunk it.
  411. @type descr: C{string}
  412. @param descr: A short description of the purpose and/or effect
  413. of this rule.
  414. """
  415. self._pattern = tag_pattern
  416. regexp = re.compile('\{(?P<chunk>%s)\}' %
  417. tag_pattern2re_pattern(tag_pattern))
  418. RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)
  419. def __repr__(self):
  420. """
  421. @rtype: C{string}
  422. @return: A string representation of this rule. This
  423. string representation has the form::
  424. <UnChunkRule: '<IN|VB.*>'>
  425. Note that this representation does not include the
  426. description string; that string can be accessed
  427. separately with the C{descr} method.
  428. """
  429. return '<UnChunkRule: '+`self._pattern`+'>'
  430. class MergeRule(RegexpChunkRule):
  431. """
  432. A rule specifying how to merge chunks in a C{ChunkString}, using
  433. two matching tag patterns: a left pattern, and a right pattern.
  434. When applied to a C{ChunkString}, it will find any chunk whose end
  435. matches left pattern, and immediately followed by a chunk whose
  436. beginning matches right pattern. It will then merge those two
  437. chunks into a single chunk.
  438. """
  439. def __init__(self, left_tag_pattern, right_tag_pattern, descr):
  440. """
  441. Construct a new C{MergeRule}.
  442. @type right_tag_pattern: C{string}
  443. @param right_tag_pattern: This rule's right tag
  444. pattern. When applied to a C{ChunkString}, this
  445. rule will find any chunk whose end matches
  446. C{left_tag_pattern}, and immediately followed by a chunk
  447. whose beginning matches this pattern. It will
  448. then merge those two chunks into a single chunk.
  449. @type left_tag_pattern: C{string}
  450. @param left_tag_pattern: This rule's left tag
  451. pattern. When applied to a C{ChunkString}, this
  452. rule will find any chunk whose end matches
  453. this pattern, and immediately followed by a chunk
  454. whose beginning matches C{right_tag_pattern}. It will
  455. then merge those two chunks into a single chunk.
  456. @type descr: C{string}
  457. @param descr: A short description of the purpose and/or effect
  458. of this rule.
  459. """
  460. # Ensure that the individual patterns are coherent. E.g., if
  461. # left='(' and right=')', then this will raise an exception:
  462. re.compile(tag_pattern2re_pattern(left_tag_pattern))
  463. re.compile(tag_pattern2re_pattern(right_tag_pattern))
  464. self._left_tag_pattern = left_tag_pattern
  465. self._right_tag_pattern = right_tag_pattern
  466. regexp = re.compile('(?P<left>%s)}{(?=%s)' %
  467. (tag_pattern2re_pattern(left_tag_pattern),
  468. tag_pattern2re_pattern(right_tag_pattern)))
  469. RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)
  470. def __repr__(self):
  471. """
  472. @rtype: C{string}
  473. @return: A string representation of this rule. This
  474. string representation has the form::
  475. <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>
  476. Note that this representation does not include the
  477. description string; that string can be accessed
  478. separately with the C{descr} method.
  479. """
  480. return ('<MergeRule: '+`self._left_tag_pattern`+', '+
  481. `self._right_tag_pattern`+'>')
  482. class SplitRule(RegexpChunkRule):
  483. """
  484. A rule specifying how to split chunks in a C{ChunkString}, using
  485. two matching tag patterns: a left pattern, and a right pattern.
  486. When applied to a C{ChunkString}, it will find any chunk that
  487. matches the left pattern followed by the right pattern. It will
  488. then split the chunk into two new chunks, at the point between the
  489. two pattern matches.
  490. """
  491. def __init__(self, left_tag_pattern, right_tag_pattern, descr):
  492. """
  493. Construct a new C{SplitRule}.
  494. @type right_tag_pattern: C{string}
  495. @param right_tag_pattern: This rule's right tag
  496. pattern. When applied to a C{ChunkString}, this rule will
  497. find any chunk containing a substring that matches
  498. C{left_tag_pattern} followed by this pattern. It will
  499. then split the chunk into two new chunks at the point
  500. between these two matching patterns.
  501. @type left_tag_pattern: C{string}
  502. @param left_tag_pattern: This rule's left tag
  503. pattern. When applied to a C{ChunkString}, this rule will
  504. find any chunk containing a substring that matches this
  505. pattern followed by C{right_tag_pattern}. It will then
  506. split the chunk into two new chunks at the point between
  507. these two matching patterns.
  508. @type descr: C{string}
  509. @param descr: A short description of the purpose and/or effect
  510. of this rule.
  511. """
  512. # Ensure that the individual patterns are coherent. E.g., if
  513. # left='(' and right=')', then this will raise an exception:
  514. re.compile(tag_pattern2re_pattern(left_tag_pattern))
  515. re.compile(tag_pattern2re_pattern(right_tag_pattern))
  516. self._left_tag_pattern = left_tag_pattern
  517. self._right_tag_pattern = right_tag_pattern
  518. regexp = re.compile('(?P<left>%s)(?=%s)' %
  519. (tag_pattern2re_pattern(left_tag_pattern),
  520. tag_pattern2re_pattern(right_tag_pattern)))
  521. RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)
  522. def __repr__(self):
  523. """
  524. @rtype: C{string}
  525. @return: A string representation of this rule. This
  526. string representation has the form::
  527. <SplitRule: '<NN>', '<DT>'>
  528. Note that this representation does not include the
  529. description string; that string can be accessed
  530. separately with the C{descr} method.
  531. """
  532. return ('<SplitRule: '+`self._left_tag_pattern`+', '+
  533. `self._right_tag_pattern`+'>')
  534. class ExpandLeftRule(RegexpChunkRule):
  535. """
  536. A rule specifying how to expand chunks in a C{ChunkString} to the left,
  537. using two matching tag patterns: a left pattern, and a right pattern.
  538. When applied to a C{ChunkString}, it will find any chunk whose beginning
  539. matches right pattern, and immediately preceded by a chink whose
  540. end matches left pattern. It will then expand the chunk to incorporate
  541. the new material on the left.
  542. """
  543. def __init__(self, left_tag_pattern, right_tag_pattern, descr):
  544. """
  545. Construct a new C{ExpandRightRule}.
  546. @type right_tag_pattern: C{string}
  547. @param right_tag_pattern: This rule's right tag
  548. pattern. When applied to a C{ChunkString}, this
  549. rule will find any chunk whose beginning matches
  550. C{right_tag_pattern}, and immediately preceded by a chink
  551. whose end matches this pattern. It will
  552. then merge those two chunks into a single chunk.
  553. @type left_tag_pattern: C{string}
  554. @param left_tag_pattern: This rule's left tag
  555. pattern. When applied to a C{ChunkString}, this
  556. rule will find any chunk whose beginning matches
  557. this pattern, and immediately preceded by a chink
  558. whose end matches C{left_tag_pattern}. It will
  559. then expand the chunk to incorporate the new material on the left.
  560. @type descr: C{string}
  561. @param descr: A short description of the purpose and/or effect
  562. of this rule.
  563. """
  564. # Ensure that the individual patterns are coherent. E.g., if
  565. # left='(' and right=')', then this will raise an exception:
  566. re.compile(tag_pattern2re_pattern(left_tag_pattern))
  567. re.compile(tag_pattern2re_pattern(right_tag_pattern))
  568. self._left_tag_pattern = left_tag_pattern
  569. self._right_tag_pattern = right_tag_pattern
  570. regexp = re.compile('(?P<left>%s)\{(?P<right>%s)' %
  571. (tag_pattern2re_pattern(left_tag_pattern),
  572. tag_pattern2re_pattern(right_tag_pattern)))
  573. RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
  574. def __repr__(self):
  575. """
  576. @rtype: C{string}
  577. @return: A string representation of this rule. This
  578. string representation has the form::
  579. <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>
  580. Note that this representation does not include the
  581. description string; that string can be accessed
  582. separately with the C{descr} method.
  583. """
  584. return ('<ExpandLeftRule: '+`self._left_tag_pattern`+', '+
  585. `self._right_tag_pattern`+'>')
  586. class ExpandRightRule(RegexpChunkRule):
  587. """
  588. A rule specifying how to expand chunks in a C{ChunkString} to the
  589. right, using two matching tag patterns: a left pattern, and a
  590. right pattern. When applied to a C{ChunkString}, it will find any
  591. chunk whose end matches left pattern, and immediately followed by
  592. a chink whose beginning matches right pattern. It will then
  593. expand the chunk to incorporate the new material on the right.
  594. """
  595. def __init__(self, left_tag_pattern, right_tag_pattern, descr):
  596. """
  597. Construct a new C{ExpandRightRule}.
  598. @type right_tag_pattern: C{string}
  599. @param right_tag_pattern: This rule's right tag
  600. pattern. When applied to a C{ChunkString}, this
  601. rule will find any chunk whose end matches
  602. C{left_tag_pattern}, and immediately followed by a chink
  603. whose beginning matches this pattern. It will
  604. then merge those two chunks into a single chunk.
  605. @type left_tag_pattern: C{string}
  606. @param left_tag_pattern: This rule's left tag
  607. pattern. When applied to a C{ChunkString}, this
  608. rule will find any chunk whose end matches
  609. this pattern, and immediately followed by a chink
  610. whose beginning matches C{right_tag_pattern}. It will
  611. then expand the chunk to incorporate the new material on the right.
  612. @type descr: C{string}
  613. @param descr: A short description of the purpose and/or effect
  614. of this rule.
  615. """
  616. # Ensure that the individual patterns are coherent. E.g., if
  617. # left='(' and right=')', then this will raise an exception:
  618. re.compile(tag_pattern2re_pattern(left_tag_pattern))
  619. re.compile(tag_pattern2re_pattern(right_tag_pattern))
  620. self._left_tag_pattern = left_tag_pattern
  621. self._right_tag_pattern = right_tag_pattern
  622. regexp = re.compile('(?P<left>%s)\}(?P<right>%s)' %
  623. (tag_pattern2re_pattern(left_tag_pattern),
  624. tag_pattern2re_pattern(right_tag_pattern)))
  625. RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
  626. def __repr__(self):
  627. """
  628. @rtype: C{string}
  629. @return: A string representation of this rule. This
  630. string representation has the form::
  631. <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>
  632. Note that this representation does not include the
  633. description string; that string can be accessed
  634. separately with the C{descr} method.
  635. """
  636. return ('<ExpandRightRule: '+`self._left_tag_pattern`+', '+
  637. `self._right_tag_pattern`+'>')
  638. class ChunkRuleWithContext(RegexpChunkRule):
  639. """
  640. A rule specifying how to add chunks to a C{ChunkString}, using
  641. three matching tag patterns: one for the left context, one for the
  642. chunk, and one for the right context. When applied to a
  643. C{ChunkString}, it will find any substring that matches the chunk
  644. tag pattern, is surrounded by substrings that match the two
  645. context patterns, and is not already part of a chunk; and create a
  646. new chunk containing the substring that matched the chunk tag
  647. pattern.
  648. Caveat: Both the left and right context are consumed when this
  649. rule matches; therefore, if you need to find overlapping matches,
  650. you will need to apply your rule more than once.
  651. """
  652. def __init__(self, left_context_tag_pattern, chunk_tag_pattern,
  653. right_context_tag_pattern, descr):
  654. """
  655. Construct a new C{ChunkRuleWithContext}.
  656. @type left_context_tag_pattern: C{string}
  657. @param left_context_tag_pattern: A tag pattern that must match
  658. the left context of C{chunk_tag_pattern} for this rule to
  659. apply.
  660. @type chunk_tag_pattern: C{string}
  661. @param chunk_tag_pattern: A tag pattern that must match for this
  662. rule to apply. If the rule does apply, then this pattern
  663. also identifies the substring that will be made into a chunk.
  664. @type right_context_tag_pattern: C{string}
  665. @param right_context_tag_pattern: A tag pattern that must match
  666. the right context of C{chunk_tag_pattern} for this rule to
  667. apply.
  668. @type descr: C{string}
  669. @param descr: A short description of the purpose and/or effect
  670. of this rule.
  671. """
  672. # Ensure that the individual patterns are coherent. E.g., if
  673. # left='(' and right=')', then this will raise an exception:
  674. re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
  675. re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
  676. re.compile(tag_pattern2re_pattern(right_context_tag_pattern))
  677. self._left_context_tag_pattern = left_context_tag_pattern
  678. self._chunk_tag_pattern = chunk_tag_pattern
  679. self._right_context_tag_pattern = right_context_tag_pattern
  680. regexp = re.compile('(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' %
  681. (tag_pattern2re_pattern(left_context_tag_pattern),
  682. tag_pattern2re_pattern(chunk_tag_pattern),
  683. tag_pattern2re_pattern(right_context_tag_pattern),
  684. ChunkString.IN_CHINK_PATTERN))
  685. replacement = r'\g<left>{\g<chunk>}\g<right>'
  686. RegexpChunkRule.__init__(self, regexp, replacement, descr)
  687. def __repr__(self):
  688. """
  689. @rtype: C{string}
  690. @return: A string representation of this rule. This
  691. string representation has the form::
  692. <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>
  693. Note that this representation does not include the
  694. description string; that string can be accessed
  695. separately with the C{descr} method.
  696. """
  697. return '<ChunkRuleWithContext: %r, %r, %r>' % (
  698. self._left_context_tag_pattern, self._chunk_tag_pattern,
  699. self._right_context_tag_pattern)
  700. ##//////////////////////////////////////////////////////
  701. ## Tag Pattern Format Conversion
  702. ##//////////////////////////////////////////////////////
  # Sanity check for a tag pattern: the whole string must be a sequence
  # of runs of characters other than braces and angle brackets, and of
  # '<...>' groups whose contents contain no braces or angle brackets.
  # This rejects braces anywhere, and nested or unbalanced angle brackets.
  #
  # This should probably be made more strict than it is -- e.g., it
  # currently accepts 'foo'.
  #
  # The pieces are raw strings: '\{' and '\}' are not valid string
  # escapes, so a plain literal only works by accident and triggers a
  # warning on newer Python versions.  The compiled pattern text is
  # unchanged.
  CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
                                 (r'[^\{\}<>]+',
                                  r'[^\{\}<>]+'))
  def tag_pattern2re_pattern(tag_pattern):
      """
      Translate a X{tag pattern} into an ordinary regular expression
      pattern.  A tag pattern is a regular expression dialect designed
      for matching sequences of tags.  It differs from a plain regular
      expression in three ways:

        - C{'<'} and C{'>'} act as parentheses; so C{'<NN>+'} matches
          one or more repetitions of C{'<NN>'}, not C{'<NN'} followed
          by one or more repetitions of C{'>'}.
        - Whitespace is ignored.  So C{'<DT> | <NN>'} is equivalent to
          C{'<DT>|<NN>'}.
        - C{'.'} is equivalent to C{'[^{}<>]'}; so C{'<NN.*>'} matches
          any single tag starting with C{'NN'}.

      The translation is carried out in this order:

        1. Remove all whitespace.
        2. Rewrite C{'<'} as C{'(<('} and C{'>'} as C{')>)'}, so that a
           quantifier after C{'>'} covers the whole tag, and so that in
           C{'<NN|IN>'} the C{'|'} has scope over C{'NN'} and C{'IN'}
           but not over the brackets.
        3. Validate the result against C{CHUNK_TAG_PATTERN}.
        4. Replace every C{'.'} that is not escaped by a backslash
           with C{ChunkString.CHUNK_TAG_CHAR}.

      @type tag_pattern: C{string}
      @param tag_pattern: The tag pattern to convert to a regular
          expression pattern.
      @raise ValueError: If C{tag_pattern} is not a valid tag pattern.
          In particular, C{tag_pattern} should not include braces; and
          it should not contain nested or mismatched angle-brackets.
      @rtype: C{string}
      @return: A regular expression pattern corresponding to
          C{tag_pattern}.
      """
      # Steps 1 and 2: normalise the pattern text.
      cleaned = re.sub(r'\s', '', tag_pattern)
      cleaned = cleaned.replace('<', '(<(').replace('>', ')>)')

      # Step 3: validate.  This has to happen before the '.' expansion,
      # because the expansion inserts braces and angle brackets that
      # CHUNK_TAG_PATTERN would reject.
      if CHUNK_TAG_PATTERN.match(cleaned) is None:
          raise ValueError('Bad tag pattern: %r' % cleaned)

      # Step 4: a '.' is unescaped when it is preceded by an even number
      # of backslashes.  Checking that directly would need a
      # variable-width look-behind, so the string is mirrored instead
      # and the same condition is expressed as a look-ahead: "not
      # followed by an odd-length run of backslashes".  The replacement
      # text is mirrored too, so the final flip restores it.
      mirrored_tag_char = ChunkString.CHUNK_TAG_CHAR[::-1]
      mirrored = cleaned[::-1]
      mirrored = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))',
                        mirrored_tag_char, mirrored)
      return mirrored[::-1]
  762. ##//////////////////////////////////////////////////////
  763. ## RegexpChunkParser
  764. ##//////////////////////////////////////////////////////
  class RegexpChunkParser(ChunkParserI):
      """
      A regular expression based chunk parser.  C{RegexpChunkParser} uses a
      sequence of X{rules} to find chunks of a single type within a
      text.  The chunking of the text is encoded using a C{ChunkString},
      and each rule acts by modifying the chunking in the
      C{ChunkString}.  The rules are all implemented using regular
      expression matching and substitution.

      The C{RegexpChunkRule} class and its subclasses (C{ChunkRule},
      C{ChinkRule}, C{UnChunkRule}, C{MergeRule}, and C{SplitRule})
      define the rules that are used by C{RegexpChunkParser}.  Each rule
      defines an C{apply} method, which modifies the chunking encoded
      by a given C{ChunkString}.

      @type _rules: C{list} of C{RegexpChunkRule}
      @ivar _rules: The list of rules that should be applied to a text.
      @type _trace: C{int}
      @ivar _trace: The default level of tracing.
      @type _chunk_node: C{string}
      @ivar _chunk_node: The node value given to chunk subtrees.
      @type _top_node: C{string}
      @ivar _top_node: The node value given to the top node of the
          chunk structure.
      """

      def __init__(self, rules, chunk_node='NP', top_node='S', trace=0):
          """
          Construct a new C{RegexpChunkParser}.

          @type rules: C{list} of C{RegexpChunkRule}
          @param rules: The sequence of rules that should be used to
              generate the chunking for a tagged text.
          @type chunk_node: C{string}
          @param chunk_node: The node value that should be used for
              chunk subtrees.  This is typically a short string
              describing the type of information contained by the chunk,
              such as C{"NP"} for base noun phrases.
          @type top_node: C{string}
          @param top_node: The node value that should be used for the
              top node of the chunk structure.
          @type trace: C{int}
          @param trace: The level of tracing that should be used when
              parsing a text.  C{0} will generate no tracing output;
              C{1} will generate normal tracing output; and C{2} or
              higher will generate verbose tracing output.
          """
          self._rules = rules
          self._trace = trace
          self._chunk_node = chunk_node
          self._top_node = top_node

      def _trace_apply(self, chunkstr, verbose):
          """
          Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in
          turn.  Print the chunk string before the first rule and after
          every rule, each time preceded by a '#' header line.  If
          C{verbose} is true, the header also shows the rule's repr.

          @type chunkstr: C{ChunkString}
          @param chunkstr: The chunk string to which each rule should be
              applied.
          @type verbose: C{boolean}
          @param verbose: Whether output should be verbose.
          @rtype: C{None}
          """
          # Every print below takes exactly one parenthesised argument, so
          # the output is the same whether print is a statement or a
          # function.
          print('# Input:')
          print(chunkstr)
          for rule in self._rules:
              rule.apply(chunkstr)
              if verbose:
                  print('# ' + rule.descr() + ' (' + repr(rule) + '):')
              else:
                  print('# ' + rule.descr() + ':')
              print(chunkstr)

      def _notrace_apply(self, chunkstr):
          """
          Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in
          turn, without producing any output.

          @param chunkstr: The chunk string to which each rule should be
              applied.
          @type chunkstr: C{ChunkString}
          @rtype: C{None}
          """
          for rule in self._rules:
              rule.apply(chunkstr)

      def parse(self, chunk_struct, trace=None):
          """
          Chunk C{chunk_struct} by running every rule over its
          C{ChunkString} encoding.

          @type chunk_struct: C{Tree}
          @param chunk_struct: the chunk structure to be (further) chunked.
              An input without a C{node} attribute is first wrapped in a
              C{Tree} labelled with this parser's top node.
          @type trace: C{int}
          @param trace: The level of tracing that should be used when
              parsing a text.  C{0} will generate no tracing output;
              C{1} will generate normal tracing output; and C{2} or
              higher will generate verbose tracing output.  This value
              overrides the trace level value that was given to the
              constructor.
          @rtype: C{Tree}
          @return: a chunk structure that encodes the chunks in a given
              tagged sentence.  A chunk is a non-overlapping linguistic
              group, such as a noun phrase.  The set of chunks
              identified in the chunk structure depends on the rules
              used to define this C{RegexpChunkParser}.  For empty
              input a warning is printed and an empty top-level tree is
              returned.
          """
          if len(chunk_struct) == 0:
              print('Warning: parsing empty text')
              return Tree(self._top_node, [])

          # Accept a bare sequence of tagged tokens as well as a Tree.
          try:
              chunk_struct.node
          except AttributeError:
              chunk_struct = Tree(self._top_node, chunk_struct)

          # Fall back to the constructor's trace level.  The test is for
          # identity with None, so an explicit trace=0 still disables
          # tracing.
          if trace is None:
              trace = self._trace

          chunkstr = ChunkString(chunk_struct)

          # Apply the sequence of rules to the chunkstring.
          if trace:
              self._trace_apply(chunkstr, trace > 1)
          else:
              self._notrace_apply(chunkstr)

          # Use the chunkstring to create a chunk structure.
          return chunkstr.to_chunkstruct(self._chunk_node)

      def rules(self):
          """
          @return: the sequence of rules used by C{RegexpChunkParser}.
          @rtype: C{list} of C{RegexpChunkRule}
          """
          return self._rules

      def __repr__(self):
          """
          @return: a concise string representation of this
              C{RegexpChunkParser}.
          @rtype: C{string}
          """
          return "<RegexpChunkParser with %d rules>" % len(self._rules)

      def __str__(self):
          """
          @return: a verbose string representation of this
              C{RegexpChunkParser}: a header line followed by one entry
              per rule, giving the rule's description and its repr.
          @rtype: C{string}
          """
          header = "RegexpChunkParser with %d rules:" % len(self._rules)
          entries = [(rule.descr(), repr(rule)) for rule in self._rules]

          # Width of the longest description; 0 when there are no rules.
          margin = max([0] + [len(descr) for descr, _ in entries])

          if margin < 35:
              # Short descriptions: description and repr share one line,
              # with the description left-justified in a common column.
              row = " %" + repr(-(margin + 3)) + "s%s"
          else:
              # Long descriptions: put the repr on a line of its own.
              row = " %s\n %s"

          return "\n".join([header] + [row % entry for entry in entries])
  904. ##//////////////////////////////////////////////////////
  905. ## Chunk Grammar
  906. ##//////////////////////////////////////////////////////
  class RegexpParser(ChunkParserI):
      r"""
      A grammar based chunk parser.  C{chunk.RegexpParser} uses a set of
      regular expression patterns to specify the behavior of the parser.
      The chunking of the text is encoded using a C{ChunkString}, and
      each rule acts by modifying the chunking in the C{ChunkString}.
      The rules are all implemented using regular expression matching
      and substitution.

      A grammar contains one or more clauses in the following form::

          NP:
            {<DT|JJ>}          # chunk determiners and adjectives
            }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
            <.*>}{<DT>         # split a chunk at a determiner
            <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                               # with one starting with a noun

      A line starts a new clause when it contains a ':' that is not
      escaped with a backslash; the text before the ':' names the clause
      and any text after it is treated as the clause's first pattern.

      The patterns of a clause are executed in order.  An earlier
      pattern may introduce a chunk boundary that prevents a later
      pattern from executing.  Sometimes an individual pattern will
      match on multiple, overlapping extents of the input.  As with
      regular expression substitution more generally, the chunker will
      identify the first match possible, then continue looking for matches
      after this one has ended.

      The clauses of a grammar are also executed in order.  A cascaded
      chunk parser is one having more than one clause.  The maximum depth
      of a parse tree created by this chunk parser is the same as the
      number of clauses in the grammar.

      When tracing is turned on, the comment portion of a line is displayed
      each time the corresponding pattern is applied.

      @type _trace: C{int}
      @ivar _trace: The default level of tracing.
      @type _grammar: C{string} or sequence of C{RegexpChunkParser}
      @ivar _grammar: The grammar exactly as it was given to the
          constructor.
      @type _loop: C{int}
      @ivar _loop: The number of times C{parse} runs through all stages.
      @type _stages: C{list} of C{RegexpChunkParser}
      @ivar _stages: The list of parsing stages corresponding to the grammar
      """

      def __init__(self, grammar, top_node='S', loop=1, trace=0):
          """
          Create a new chunk parser, from the given start state
          and set of chunk patterns.

          @param grammar: The grammar, or a list of RegexpChunkParser objects
          @type grammar: C{string} or C{list} of C{RegexpChunkParser}
          @param top_node: The top node of the tree being created
          @type top_node: C{string} or C{Nonterminal}
          @param loop: The number of times to run through the patterns
          @type loop: C{int}
          @type trace: C{int}
          @param trace: The level of tracing that should be used when
              parsing a text.  C{0} will generate no tracing output;
              C{1} will generate normal tracing output; and C{2} or
              higher will generate verbose tracing output.
          @raise TypeError: If C{grammar} is neither a string nor an
              iterable consisting only of C{RegexpChunkParser} objects.
          @raise ValueError: If a string grammar contains a pattern
              before any stage marker.
          """
          self._trace = trace
          self._stages = []
          self._grammar = grammar
          self._loop = loop

          # ``basestring`` exists only on Python 2; fall back to ``str``
          # so the string test does not raise NameError elsewhere.
          try:
              string_types = basestring
          except NameError:
              string_types = str

          if isinstance(grammar, string_types):
              self._parse_grammar(grammar, top_node, trace)
              return

          # Make sure the grammar looks like it has the right type.
          type_err = ('Expected string or list of RegexpChunkParsers '
                      'for the grammar.')
          try:
              stages = list(grammar)
          except Exception:
              # Anything that cannot be turned into a list is reported as
              # a type error.  KeyboardInterrupt and SystemExit are
              # deliberately left to propagate.
              raise TypeError(type_err)
          for elt in stages:
              if not isinstance(elt, RegexpChunkParser):
                  raise TypeError(type_err)
          self._stages = stages

      def _parse_grammar(self, grammar, top_node, trace):
          """
          Helper function for __init__: parse the grammar if it is a
          string.  The grammar is read line by line; each stage header
          closes the stage collected so far, and every other line that
          is neither blank nor a comment is parsed into a rule of the
          current stage.
          """
          # A stage header is "<name>:<optional first rule>", split at the
          # first ':' that is not escaped with a backslash.  The name is
          # built from escape pairs (backslash + any character) and from
          # characters that are neither ':' nor a backslash.  Excluding
          # the backslash from the second alternative stops backtracking
          # from splitting an escape pair and treating '\:' as a
          # separator.
          stage_header = re.compile(
              r'(?P<nonterminal>(\\.|[^:\\])*)(:(?P<rule>.*))')

          rules = []
          lhs = None
          for line in grammar.split('\n'):
              line = line.strip()

              m = stage_header.match(line)
              if m:
                  # Record the stage that we just completed.
                  self._add_stage(rules, lhs, top_node, trace)
                  # Start a new stage; whatever follows the ':' is
                  # handled as an ordinary rule line below.
                  lhs = m.group('nonterminal').strip()
                  rules = []
                  line = m.group('rule').strip()

              # Skip blank & comment-only lines
              if line == '' or line.startswith('#'):
                  continue

              # Add the rule
              rules.append(RegexpChunkRule.parse(line))

          # Record the final stage
          self._add_stage(rules, lhs, top_node, trace)

      def _add_stage(self, rules, lhs, top_node, trace):
          """
          Helper function for __init__: add a new stage to the parser.
          A stage without rules is silently dropped.

          @raise ValueError: If there are rules but no stage name, i.e.
              a pattern appeared before the first stage marker.
          """
          if not rules:
              return
          if not lhs:
              raise ValueError('Expected stage marker (eg NP:)')
          parser = RegexpChunkParser(rules, chunk_node=lhs,
                                     top_node=top_node, trace=trace)
          self._stages.append(parser)

      def parse(self, chunk_struct, trace=None):
          """
          Apply the chunk parser to this input.  All stages are run in
          order, and the whole sequence is repeated C{loop} times; the
          output of each stage is the input of the next.

          @type chunk_struct: C{Tree}
          @param chunk_struct: the chunk structure to be (further) chunked
          @type trace: C{int}
          @param trace: The level of tracing that should be used when
              parsing a text.  C{0} will generate no tracing output;
              C{1} will generate normal tracing output; and C{2} or
              higher will generate verbose tracing output.  This value
              overrides the trace level value that was given to the
              constructor.
          @return: the chunked output.
          @rtype: C{Tree}
          """
          # Identity test, so an explicit trace=0 is not replaced by the
          # constructor's default.
          if trace is None:
              trace = self._trace
          for _ in range(self._loop):
              for parser in self._stages:
                  chunk_struct = parser.parse(chunk_struct, trace=trace)
          return chunk_struct

      def __repr__(self):
          """
          @return: a concise string representation of this C{chunk.RegexpParser}.
          @rtype: C{string}
          """
          return "<chunk.RegexpParser with %d stages>" % len(self._stages)

      def __str__(self):
          """
          @return: a verbose string representation of this
              C{chunk.RegexpParser}: a header line followed by the
              verbose representation of every stage.
          @rtype: C{string}
          """
          header = "chunk.RegexpParser with %d stages:" % len(self._stages)
          return "\n".join(
              [header] + [parser.__str__() for parser in self._stages])
  1045. ##//////////////////////////////////////////////////////
  1046. ## Demonstration code
  1047. ##//////////////////////////////////////////////////////
  1048. def demo_eval(chunkparser, text):
  1049. """
  1050. Demonstration code for evaluating a chunk parser, using a
  1051. C{ChunkScore}. This function assumes that C{text} contains one
  1052. sentence per line, and that each sentence has the form expected by
  1053. C{tree.chunk}. It runs the given chunk parser on each sentence i

Large files are truncated, but you can click here to view the full file