PageRenderTime 50ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk-old/src/nltk/corpus/tree.py

http://nltk.googlecode.com/
Python | 352 lines | 286 code | 19 blank | 47 comment | 25 complexity | d1f55a3f8d98948bdf284d4f6df7edff MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
  1. # Natural Language Toolkit: Tree Corpus Reader
  2. #
  3. # Copyright (C) 2001 University of Pennsylvania
  4. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  5. # URL: <http://nltk.sf.net>
  6. # For license information, see LICENSE.TXT
  7. #
  8. # $Id: tree.py 2121 2004-08-18 10:49:27Z stevenbird $
  9. import os.path, re
  10. from nltk.corpus import CorpusReaderI, get_basedir
  11. from nltk.tokenreader import *
  12. class TreebankCorpusReader(CorpusReaderI):
  13. """
  14. A corpus reader implementation for the Treebank.
  15. """
  16. # Default token readers.
  17. _ws_reader = WhitespaceSeparatedTokenReader(SUBTOKENS='WORDS')
  18. _prd_reader = TreebankFileTokenReader(SUBTOKENS='WORDS')
  19. _mrg_reader = TreebankFileTokenReader(preterminal_tags=True,
  20. SUBTOKENS='WORDS', TAG='POS')
  21. _tag_reader = TreebankTaggedTokenReader(SUBTOKENS='WORDS', TAG='POS')
  22. def __init__(self, name, rootdir, treebank_2=False,
  23. description_file=None, license_file=None,
  24. copyright_file=None):
  25. self._name = name
  26. self._original_rootdir = rootdir
  27. self._description_file = description_file
  28. self._license_file = license_file
  29. self._copyright_file = copyright_file
  30. if treebank_2:
  31. # 3 groups:
  32. self._groups = ('tagged', 'parsed', 'combined')
  33. self._group_directory = {
  34. 'tagged':'tagged/pos', 'parsed':'parsed/prd',
  35. 'combined':'combined/mrg' }
  36. self._group_mask = { 'tagged':r'.*\.pos',
  37. 'parsed':r'.*\.prd', 'combined':'.*\.mrg' }
  38. else:
  39. # 4 groups:
  40. self._groups = ('raw', 'tagged', 'parsed', 'combined')
  41. self._group_directory = dict([(g, g) for g in self._groups])
  42. self._group_mask = dict([(g, r'.*') for g in self._groups])
  43. # Are the merged items "virtual" (i.e., constructed on the
  44. # fly from the parsed & tagged items)? This is true iff the
  45. # treebank corpus doesn't contain a "combined" subdirectory.
  46. self._virtual_merged = 0
  47. # Postpone actual initialization until the corpus is accessed;
  48. # this gives the user a chance to call set_basedir(), and
  49. # prevents "import nltk.corpus" from raising an exception.
  50. # We'll also want to re-initialize the corpus if basedir
  51. # ever changes.
  52. self._basedir = None
  53. self._description = None
  54. self._license = None
  55. self._copyright = None
  56. self._items = None
  57. self._group_items = None
  58. self._initialized = False
  59. #////////////////////////////////////////////////////////////
  60. #// Initialization
  61. #////////////////////////////////////////////////////////////
  62. def _initialize(self):
  63. "Make sure that we're initialized."
  64. # If we're already initialized, then do nothing.
  65. if self._initialized: return
  66. # Make sure the corpus is installed.
  67. basedir = get_basedir()
  68. if not os.path.isabs(self._original_rootdir):
  69. if not os.path.isdir(os.path.join(basedir, self._original_rootdir)):
  70. raise IOError('%s is not installed' % self._name)
  71. self._basedir = basedir
  72. self._rootdir = os.path.join(basedir, self._original_rootdir)
  73. else:
  74. if not os.path.isdir(self._original_rootdir):
  75. raise IOError('%s is not installed' % self._name)
  76. self._basedir = '' # empty
  77. self._rootdir = self._original_rootdir
  78. # # Check the directory for 'merged', and change it to
  79. # # 'combined' if appropriate.
  80. # if 'merged' in self._groups:
  81. # if os.path.isdir(os.path.join(self._rootdir, 'combined')):
  82. # self._group_directory['merged'] = 'combined'
  83. # Get the list of items in each group.
  84. self._group_items = {}
  85. for group in self._groups:
  86. self._find_items(group)
  87. if not self._group_items.has_key('combined'):
  88. self._virtual_merged = 1
  89. self._find_virtual_merged_items()
  90. # Get the overall list of items
  91. self._items = []
  92. for items in self._group_items.values():
  93. self._items += items
  94. # Read metadata from files
  95. if self._description is None and self._description_file is not None:
  96. path = os.path.join(self._rootdir, self._description_file)
  97. self._description = open(path).read()
  98. if self._license is None and self._license_file is not None:
  99. path = os.path.join(self._rootdir, self._license_file)
  100. self._license = open(path).read()
  101. if self._copyright is None and self._copyright_file is not None:
  102. path = os.path.join(self._rootdir, self._copyright_file)
  103. self._copyright = open(path).read()
  104. self._initialized = True
  105. def _find_items(self, group):
  106. directory = self._group_directory.get(group)
  107. mask = self._group_mask.get(group)
  108. if directory:
  109. self._group_items[group] = []
  110. path = os.path.join(self._rootdir, directory)
  111. for dir_path, dir_names, file_names in os.walk(path):
  112. for file_name in file_names:
  113. if re.match(mask + r'$', file_name) and \
  114. not file_name.startswith('readme'):
  115. self._group_items[group].append(
  116. os.path.join(group, file_name))
  117. # os.path.join(dir_path, file_name))
  118. def _find_virtual_merged_items(self):
  119. # Check to make sure we have both the .tagged and the .parsed files.
  120. self._group_items['combined'] = merged = []
  121. is_tagged = {}
  122. for item in self._group_items.get('tagged', []):
  123. basename = os.path.basename(item).split('.')[0]
  124. is_tagged[basename] = 1
  125. for item in self._group_items.get('parsed', []):
  126. basename = os.path.basename(item).split('.')[0]
  127. if is_tagged.get(basename):
  128. merged.append(os.path.join('combined', '%s.mrg' % basename))
  129. #////////////////////////////////////////////////////////////
  130. #// Corpus Information/Metadata
  131. #////////////////////////////////////////////////////////////
  132. def name(self):
  133. return self._name
  134. def description(self):
  135. self._initialize()
  136. return self._description
  137. def license(self):
  138. self._initialize()
  139. return self._license
  140. def copyright(self):
  141. self._initialize()
  142. return self._copyright
  143. def installed(self):
  144. try: self._initialize()
  145. except IOError: return 0
  146. return 1
  147. def rootdir(self):
  148. """
  149. @return: The path to the root directory for this corpus.
  150. @rtype: C{string}
  151. """
  152. self._initialize()
  153. return self._rootdir
  154. #////////////////////////////////////////////////////////////
  155. #// Data access (items)
  156. #////////////////////////////////////////////////////////////
  157. def items(self, group=None):
  158. self._initialize()
  159. if group is None: return self._items
  160. else: return tuple(self._group_items.get(group)) or ()
  161. def read(self, item, *reader_args, **reader_kwargs):
  162. source = '%s/%s' % (self._name, item)
  163. text = self.raw_read(item)
  164. reader = self._token_reader(item)
  165. return reader.read_token(text, source=source,
  166. *reader_args, **reader_kwargs)
  167. def xread(self, item, *reader_args, **reader_kwargs):
  168. # Default: no iterators.
  169. return self.read(item, *reader_args, **reader_kwargs)
  170. def path(self, item):
  171. self._initialize()
  172. if self._virtual_merged and item.startswith('combined'):
  173. estr = 'The given item is virtual; it has no path'
  174. raise NotImplementedError, estr
  175. else:
  176. return os.path.join(self._rootdir, item)
  177. def open(self, item):
  178. return open(self.path(item))
  179. def raw_read(self, item):
  180. if self._virtual_merged and item.startswith('combined'):
  181. basename = os.path.basename(item).split('.')[0]
  182. tagged_item = os.path.join('tagged', '%s.pos' % basename)
  183. parsed_item = os.path.join('parsed', '%s.prd' % basename)
  184. tagged = self.read(tagged_item)
  185. parsed = self.read(parsed_item)
  186. return self.merge(tagged, parsed)
  187. else:
  188. return self.open(item).read()
  189. def _token_reader(self, item):
  190. self._initialize()
  191. if item in self._group_items['combined']:
  192. return self._mrg_reader
  193. elif item in self._group_items['tagged']:
  194. return self._tag_reader
  195. elif item in self._group_items['parsed']:
  196. return self._prd_reader
  197. elif item in self._group_items['raw']:
  198. return self._ws_reader
  199. else:
  200. raise ValueError, 'Unknown item %r' % (item,)
  201. #////////////////////////////////////////////////////////////
  202. #// Parsed/Tagged Merging
  203. #////////////////////////////////////////////////////////////
  204. def merge(self, tagged, parsed):
  205. """
  206. Create a merged treebank file (containing both parse and
  207. part-of-speech tagging information), given the parsed contents
  208. and the part-of-speech tagged contents for that file.
  209. This merge procedure is somewhat robust. In particular:
  210. - It handles brace conversions (eg C{'('} -> C{'-LRB-'}). It
  211. also accepts the (incorrect?) variants C{'*LRB*'} etc., and
  212. automatically convers the to the standard C{'-LRB-'} forms.
  213. - It complains but does not fail if the parse file drops
  214. the last word or the last quote mark.
  215. - It handles traces & other null elements in the parse.
  216. - It handles extra elements in the parse that are not present
  217. in the tagged text. (E.g. in C{'(WHP-1 0)'}.
  218. This is enough robustness to handle wsj_0001 through wsj_0099;
  219. It hasn't yet been tested on the rest of the treebank.
  220. @param tagged: The part-of-speech tagged contents of the file
  221. to merge.
  222. @type tagged: C{string}
  223. @param parsed: The parse contents of the file to merge.
  224. @type parsed: C{string}
  225. @return: The merged contents of the treebank file.
  226. @rtype: C{string}
  227. @todo: Increase the robustness of this method.
  228. """
  229. # Clean up the tagged contents of the file.
  230. tagged = tagged.replace('[', ' ').replace(']', ' ')
  231. tagged = re.sub('={10,}', '', tagged) # >=10 equals signs
  232. tagged = tagged.replace('{', '-LCB-')
  233. tagged = tagged.replace('}', '-RCB-')
  234. tagged = tagged.replace('(', '-LRB-')
  235. tagged = tagged.replace(')', '-RRB-')
  236. # Divide the tagged contents into a list of words. Reverse
  237. # it, so we can use pop() to remove one word at a time.
  238. self._tagged_words = tagged.split()
  239. # Use re.sub to replace words with tagged words. The regexp
  240. # we're using will only match words, not part-of-speech tags.
  241. # Use a helper method (_merge_tag) to find the replacement for
  242. # each match.
  243. try:
  244. self._mismatches = 0
  245. self._first_mismatch = None
  246. self._tagged_index = 0
  247. merged = re.sub(r'\s([^\s\(\)]+)', self._merge_tag, parsed)
  248. except IndexError:
  249. raise ValueError('Merge failed: more words in the parsed '+
  250. 'contents than in the tagged contents')
  251. # Check that we used all tagged words.
  252. if self._tagged_index != len(self._tagged_words):
  253. if (self._tagged_index == (len(self._tagged_words)-1) and
  254. self._tagged_words[-1] == "''/''"):
  255. print 'Warning: dropped close quote'
  256. elif self._tagged_index == (len(self._tagged_words)-1):
  257. print ('Warning: dropped last word (%r)' %
  258. self._tagged_words[-1])
  259. else:
  260. print self._tagged_index, len(self._tagged_words)
  261. print self._tagged_words[-5:]
  262. raise ValueError('Merge failed: more words in the tagged '+
  263. 'contents than in the parsed contents')
  264. return merged
  265. def _merge_tag(self, match):
  266. """
  267. A helper function for L{merge}, that is used as the C{repl}
  268. argument for a regular expression substitution. Given the
  269. regexp match for a word in the treebank, return the
  270. corresponding tagged word.
  271. """
  272. # Get the next parsed word
  273. parseword = match.group(1)
  274. # Annoying clean-up
  275. if parseword[:1] == '*' and parseword[-1:] == '*':
  276. if re.match(r'\*[LR][CRS]B\*', parseword):
  277. parseword = '-' + parseword[1:-1] + '-'
  278. # Get the next tagged word.
  279. taggedword = self._tagged_words[self._tagged_index]
  280. split = taggedword.rfind('/')
  281. if split == -1:
  282. raise ValueError('Merge failed: untagged word %s' % taggedword)
  283. word = taggedword[:split].replace('\\', '')
  284. tag = taggedword[split+1:]
  285. # If they don't match, then try returning the parse word, and
  286. # continuing.
  287. if word != parseword:
  288. if not parseword.startswith('*'):
  289. self._mismatches += 1
  290. if self._mismatches == 1:
  291. self._first_mismatch = '%r vs. %r' % (word, parseword)
  292. if self._mismatches > 5:
  293. print self._tagged_words[self._tagged_index:
  294. self._tagged_index+5]
  295. raise ValueError("Merge failed: tagged & parsed files "+
  296. "don't match:\n "+ self._first_mismatch)
  297. return word
  298. # If they match, then return the tagged word, expressed as a
  299. # tree constituant.
  300. self._mismatches = 0
  301. self._tagged_index += 1
  302. return ' (%s %s)' % (tag, word)
  303. #////////////////////////////////////////////////////////////
  304. #// Structure access (groups)
  305. #////////////////////////////////////////////////////////////
  306. def groups(self):
  307. return self._groups