PageRenderTime 54ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/corpus/reader/pl196x.py

https://github.com/BrucePHill/nltk
Python | 279 lines | 257 code | 14 blank | 8 comment | 6 complexity | 45474029a206be3cd4a74c229a7713e8 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit:
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import os
  8. import re
  9. from nltk import compat
  10. from nltk import tokenize, tree
  11. from .util import *
  12. from .api import *
  13. from .xmldocs import XMLCorpusReader
  14. # (?:something) -- non-grouping parentheses!
  15. PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
  16. SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
  17. TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
  18. WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
  19. TYPE = re.compile(r'type="(.*?)"')
  20. ANA = re.compile(r'ana="(.*?)"')
  21. TEXTID = re.compile(r'text id="(.*?)"')
  22. class TEICorpusView(StreamBackedCorpusView):
  23. def __init__(self, corpus_file,
  24. tagged, group_by_sent, group_by_para,
  25. tag_mapping_function=None, headLen=0,
  26. textids=None):
  27. self._tagged = tagged
  28. self._textids = textids
  29. self._group_by_sent = group_by_sent
  30. self._group_by_para = group_by_para
  31. # WARNING -- skip header
  32. StreamBackedCorpusView.__init__(self, corpus_file, startpos=headLen)
  33. _pagesize = 4096
  34. def read_block(self, stream):
  35. block = stream.readlines(self._pagesize)
  36. block = concat(block)
  37. while (block.count('<text id') > block.count('</text>')) \
  38. or block.count('<text id') == 0:
  39. tmp = stream.readline()
  40. if len(tmp) <= 0:
  41. break
  42. block += tmp
  43. block = block.replace('\n','')
  44. textids = TEXTID.findall(block)
  45. if self._textids:
  46. for tid in textids:
  47. if tid not in self._textids:
  48. beg = block.find(tid)-1
  49. end = block[beg: ].find('</text>')+len('</text>')
  50. block = block[ :beg]+block[beg+end: ]
  51. output = []
  52. for para_str in PARA.findall(block):
  53. para = []
  54. for sent_str in SENT.findall(para_str):
  55. if not self._tagged:
  56. sent = WORD.findall(sent_str)
  57. else:
  58. sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
  59. if self._group_by_sent:
  60. para.append(sent)
  61. else:
  62. para.extend(sent)
  63. if self._group_by_para:
  64. output.append(para)
  65. else:
  66. output.extend(para)
  67. return output
  68. def _parse_tag(self, tag_word_tuple):
  69. (tag, word) = tag_word_tuple
  70. if tag.startswith('w'):
  71. tag = ANA.search(tag).group(1)
  72. else: # tag.startswith('c')
  73. tag = TYPE.search(tag).group(1)
  74. return (word, tag)
  75. class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
  76. headLen = 2770
  77. def __init__(self, *args, **kwargs):
  78. if 'textid_file' in kwargs: self._textids = kwargs['textid_file']
  79. else: self._textids = None
  80. XMLCorpusReader.__init__(self, *args)
  81. CategorizedCorpusReader.__init__(self, kwargs)
  82. self._init_textids()
  83. def _init_textids(self):
  84. self._f2t = defaultdict(list)
  85. self._t2f = defaultdict(list)
  86. if self._textids is not None:
  87. for line in self.open(self._textids).readlines():
  88. line = line.strip()
  89. file_id, text_ids = line.split(' ', 1)
  90. if file_id not in self.fileids():
  91. raise ValueError('In text_id mapping file %s: %s '
  92. 'not found' % (catfile, file_id))
  93. for text_id in text_ids.split(self._delimiter):
  94. self._add_textids(file_id, text_id)
  95. def _add_textids(self, file_id, text_id):
  96. self._f2t[file_id].append(text_id)
  97. self._t2f[text_id].append(file_id)
  98. def _resolve(self, fileids, categories, textids=None):
  99. tmp = None
  100. if fileids is not None:
  101. if not tmp:
  102. tmp = fileids, None
  103. else:
  104. raise ValueError('Specify only fileids, categories or textids')
  105. if categories is not None:
  106. if not tmp:
  107. tmp = self.fileids(categories), None
  108. else:
  109. raise ValueError('Specify only fileids, categories or textids')
  110. if textids is not None:
  111. if not tmp:
  112. if isinstance(textids, compat.string_types): textids = [textids]
  113. files = sum((self._t2f[t] for t in textids), [])
  114. tdict = dict()
  115. for f in files:
  116. tdict[f] = (set(self._f2t[f]) & set(textids))
  117. tmp = files, tdict
  118. else:
  119. raise ValueError('Specify only fileids, categories or textids')
  120. return None, None
  121. def decode_tag(self, tag):
  122. # to be implemented
  123. return tag
  124. def textids(self, fileids=None, categories=None):
  125. """
  126. In the pl196x corpus each category is stored in single
  127. file and thus both methods provide identical functionality. In order
  128. to accommodate finer granularity, a non-standard textids() method was
  129. implemented. All the main functions can be supplied with a list
  130. of required chunks---giving much more control to the user.
  131. """
  132. fileids, _ = self._resolve(fileids, categories)
  133. if fileids is None: return sorted(self._t2f)
  134. if isinstance(fileids, compat.string_types):
  135. fileids = [fileids]
  136. return sorted(sum((self._f2t[d] for d in fileids), []))
  137. def words(self, fileids=None, categories=None, textids=None):
  138. fileids, textids = self._resolve(fileids, categories, textids)
  139. if fileids is None: fileids = self._fileids
  140. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  141. if textids:
  142. return concat([TEICorpusView(self.abspath(fileid),
  143. False, False, False,
  144. headLen=self.headLen,
  145. textids=textids[fileid])
  146. for fileid in fileids])
  147. else:
  148. return concat([TEICorpusView(self.abspath(fileid),
  149. False, False, False,
  150. headLen=self.headLen)
  151. for fileid in fileids])
  152. def sents(self, fileids=None, categories=None, textids=None):
  153. fileids, textids = self._resolve(fileids, categories, textids)
  154. if fileids is None: fileids = self._fileids
  155. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  156. if textids:
  157. return concat([TEICorpusView(self.abspath(fileid),
  158. False, True, False,
  159. headLen=self.headLen,
  160. textids=textids[fileid])
  161. for fileid in fileids])
  162. else:
  163. return concat([TEICorpusView(self.abspath(fileid),
  164. False, True, False,
  165. headLen=self.headLen)
  166. for fileid in fileids])
  167. def paras(self, fileids=None, categories=None, textids=None):
  168. fileids, textids = self._resolve(fileids, categories, textids)
  169. if fileids is None: fileids = self._fileids
  170. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  171. if textids:
  172. return concat([TEICorpusView(self.abspath(fileid),
  173. False, True, True,
  174. headLen=self.headLen,
  175. textids=textids[fileid])
  176. for fileid in fileids])
  177. else:
  178. return concat([TEICorpusView(self.abspath(fileid),
  179. False, True, True,
  180. headLen=self.headLen)
  181. for fileid in fileids])
  182. def tagged_words(self, fileids=None, categories=None, textids=None):
  183. fileids, textids = self._resolve(fileids, categories, textids)
  184. if fileids is None: fileids = self._fileids
  185. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  186. if textids:
  187. return concat([TEICorpusView(self.abspath(fileid),
  188. True, False, False,
  189. headLen=self.headLen,
  190. textids=textids[fileid])
  191. for fileid in fileids])
  192. else:
  193. return concat([TEICorpusView(self.abspath(fileid),
  194. True, False, False,
  195. headLen=self.headLen)
  196. for fileid in fileids])
  197. def tagged_sents(self, fileids=None, categories=None, textids=None):
  198. fileids, textids = self._resolve(fileids, categories, textids)
  199. if fileids is None: fileids = self._fileids
  200. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  201. if textids:
  202. return concat([TEICorpusView(self.abspath(fileid),
  203. True, True, False,
  204. headLen=self.headLen,
  205. textids=textids[fileid])
  206. for fileid in fileids])
  207. else:
  208. return concat([TEICorpusView(self.abspath(fileid),
  209. True, True, False,
  210. headLen=self.headLen)
  211. for fileid in fileids])
  212. def tagged_paras(self, fileids=None, categories=None, textids=None):
  213. fileids, textids = self._resolve(fileids, categories, textids)
  214. if fileids is None: fileids = self._fileids
  215. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  216. if textids:
  217. return concat([TEICorpusView(self.abspath(fileid),
  218. True, True, True,
  219. headLen=self.headLen,
  220. textids=textids[fileid])
  221. for fileid in fileids])
  222. else:
  223. return concat([TEICorpusView(self.abspath(fileid),
  224. True, True, True,
  225. headLen=self.headLen)
  226. for fileid in fileids])
  227. def xml(self, fileids=None, categories=None):
  228. fileids, _ = self._resolve(fileids, categories)
  229. if len(fileids) == 1: return XMLCorpusReader.xml(self, fileids[0])
  230. else: raise TypeError('Expected a single file')
  231. def raw(self, fileids=None, categories=None):
  232. fileids, _ = self._resolve(fileids, categories)
  233. if fileids is None: fileids = self._fileids
  234. elif isinstance(fileids, compat.string_types): fileids = [fileids]
  235. return concat([self.open(f).read() for f in fileids])