PageRenderTime 62ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk_contrib/nltk_contrib/concord.py

http://nltk.googlecode.com/
Python | 867 lines | 798 code | 8 blank | 61 comment | 6 complexity | efb97f390ef98829dee21abc5726d8bf MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
  1. # Natural Language Toolkit: Concordance System
  2. #
  3. # Copyright (C) 2005 University of Melbourne
  4. # Author: Peter Spiller
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
import re, string
from collections import Counter
from math import *

from nltk.corpus import brown
from nltk.probability import *
  11. class SentencesIndex(object):
  12. """Class implementing an index of a collection of sentences.
  13. Given a list of sentences, where each sentence is a list of words,
  14. this class generates an index of the list. Each word should be a (word, POS
  15. tag) pair. The index is stored as a dictionary, with the hashable items as
  16. keys and a list of (sentence number, word number) tuples as values. This
  17. class also generates a list of sentence lengths.
  18. """
  19. def __init__(self, sentences):
  20. """ Constructor. Takes the list of sentences to index.
  21. @type sentences: list
  22. @param sentences: List of sentences to index. Sentences should be
  23. lists of (string, string) pairs.
  24. """
  25. sentenceCount = 0
  26. self.index = {}
  27. self.lengths = []
  28. # for each sentence:
  29. for sentence in sentences:
  30. # add the sentences length to the list of sentence lengths
  31. self.lengths.append(len(sentence))
  32. wordCount = 0
  33. for word in sentence:
  34. self.index[word] = self.index.get(word, []) + [(sentenceCount, wordCount)]
  35. wordCount += 1
  36. sentenceCount += 1
  37. def getIndex(self):
  38. """ Returns the index dictionary.
  39. @rtype: dictionary
  40. @returns: The dictionary containing the index.
  41. """
  42. return self.index
  43. def getSentenceLengths(self):
  44. """ Returns the list of sentence lengths.
  45. Element 0 is the length of the first sentence, element 1 the second,
  46. etc.
  47. @rtype: list
  48. @returns: List of lengths of sentences.
  49. """
  50. return self.lengths
  51. class IndexConcordance(object):
  52. """ Class that generates concordances from a list of sentences.
  53. Uses an index for efficiency. If a SentencesIndex object is provided,
  54. it will be used, otherwise one will be constructed from the list of
  55. sentences. When generating a concordance, the supplied regular expression
  56. is used to filter the list of words in the index. Any that match are looked
  57. up in the index, and their lists of (sentence number, word number) pairs are
  58. used to extract the correct amount of context from the sentences.
  59. Although this class also allows regular expressions to be specified for the
  60. left and right context, they are not used on the index. If only left/right
  61. regexps are provided, the class will essentially generate a concordance for
  62. every word in the corpus, then filter it with the regexps. This will not be
  63. very efficient and requires very large amounts of memory.
  64. @cvar SORT_WORD: Constant for sorting by target word.
  65. @cvar SORT_POS: Constant for sorting by target word's POS tag.
  66. @cvar SORT_NUM: Constant for sorting by sentence number.
  67. @cvar SORT_RIGHT_CONTEXT: Constant for sorting by the first word of the
  68. right context.
  69. """
  70. # constants for different types of sort
  71. SORT_WORD = 0
  72. SORT_POS = 1
  73. SORT_NUM = 2
  74. SORT_RIGHT_CONTEXT = 3
  75. def __init__(self, sentences, index=None):
  76. """ Constructor.
  77. Arguments:
  78. @type sentences: list
  79. @param sentences: List of sentences to create a concordance for.
  80. Sentences should be lists of (string, string) pairs.
  81. @type index: SentencesIndex
  82. @param index: SentencesIndex object to use as an index. If this is
  83. not provided, one will be generated.
  84. """
  85. self.sentences = sentences
  86. self.index = index
  87. # generate an index if one wasn't provided
  88. if self.index == None:
  89. self.index = SentencesIndex(self.sentences)
  90. def formatted(self, leftRegexp=None, middleRegexp=".*", rightRegexp=None,
  91. leftContextLength=3, rightContextLength=3, contextInSentences=False,
  92. contextChars=50, maxKeyLength=0, showWord=True,
  93. sort=0, showPOS=True, flipWordAndPOS=False, verbose=False):
  94. """Generates and displays keyword-in-context formatted concordance data.
  95. This is a convenience method that combines raw() and display()'s
  96. options. Unless you need raw output, this is probably the most useful
  97. method.
  98. @type leftRegexp: string
  99. @param leftRegexp: Regular expression applied to the left context
  100. to filter output. Defaults to None.
  101. @type middleRegexp: string
  102. @param middleRegexp: Regular expression applied to target word to
  103. filter output. Defaults to ".*" (ie everything).
  104. @type rightRegexp: string
  105. @param rightRegexp: Regular expression applied to the right context
  106. to filter output. Defaults to None.
  107. @type leftContextLength: number
  108. @param leftContextLength: Length of left context. Defaults to 3.
  109. @type rightContextLength: number
  110. @param rightContextLength: Length of right context. Defaults to 3.
  111. @type contextInSentences: number
  112. @param contextInSentences: Determines whether the context lengths
  113. arguments are in words or sentences. If false, the context lengths
  114. are in words - a rightContextLength argument of 2 results in two
  115. words of right context. If true, a rightContextLength argument of 2
  116. results in a right context consisting of the portion of the target
  117. word's sentence to the right of the target, plus the two sentences
  118. to the right of that sentence. Defaults to False.
  119. @type contextChars number
  120. @param contextChars: Amount of context to show. If set to less than
  121. 0, does not limit amount of context shown
  122. (may look ugly). Defaults to 55.
  123. @type maxKeyLength: number
  124. @param maxKeyLength: Max number of characters to show for the
  125. target word. If 0 or less, this value is
  126. calculated so as to fully show all target
  127. words. Defaults to 0.
  128. @type showWord: boolean
  129. @param showWord: Whether to show words. Defaults to True.
  130. @type sort: integer
  131. @param sort: Should be set to one the provided SORT constants. If
  132. SORT_WORD, the output is sorted on the target word. If SORT_POS, the
  133. output is sorted on the target word's POS tag. If SORT_NUM, the
  134. output is sorted by sentence number. If SORT_RIGHT_CONTEXT, the
  135. output is sorted on the first word of the right context. Defaults to
  136. SORT_WORD.
  137. @type showPOS: boolean
  138. @param showPOS: Whether to show POS tags. Defaults to True.
  139. @type flipWordAndPOS: boolean
  140. @param flipWordAndPOS: If true, displays POS tags first instead of
  141. words (ie prints 'cc/and' instead of 'and/cc'). Defaults to False.
  142. @type verbose: boolean
  143. @param verbose: Displays some extra status information. Defaults
  144. to False.
  145. """
  146. self.format(self.raw(leftRegexp, middleRegexp, rightRegexp, leftContextLength,
  147. rightContextLength, contextInSentences, sort, verbose), contextChars,
  148. maxKeyLength, showWord, showPOS, flipWordAndPOS, verbose)
  149. def raw(self, leftRegexp=None, middleRegexp=".*", rightRegexp=None,
  150. leftContextLength=3, rightContextLength=3, contextInSentences=False,
  151. sort=0, verbose=False):
  152. """ Generates and returns raw concordance data.
  153. Regular expressions supplied are evaluated over the appropriate part of
  154. each line of the concordance. For the purposes of evaluating the regexps,
  155. the lists of (word, POS tag) tuples are flattened into a space-separated
  156. list of word/POS tokens (ie the word followed by '/' followed by the POS
  157. tag). A regexp like '^must/.*' matches the word 'must' with any POS tag,
  158. while one like '.*/nn$' matches any word with a POS tag of 'nn'. All
  159. regexps are evaluated over lowercase versions of the text.
  160. @type leftRegexp: string
  161. @param leftRegexp: Regular expression applied to the left context
  162. to filter output. Defaults to None.
  163. @type middleRegexp: string
  164. @param middleRegexp: Regular expression applied to target word to
  165. filter output. Defaults to ".*" (ie everything).
  166. @type rightRegexp: string
  167. @param rightRegexp: Regular expression applied to the right context
  168. to filter output. Defaults to None.
  169. @type leftContextLength: number
  170. @param leftContextLength: Length of left context. Defaults to 3.
  171. @type rightContextLength: number
  172. @param rightContextLength: Length of right context. Defaults to 3.
  173. @type contextInSentences: number
  174. @param contextInSentences: Determines whether the context lengths
  175. arguments are in words or sentences. If false, the context lengths
  176. are in words - a rightContextLength argument of 2 results in two
  177. words of right context. If true, a rightContextLength argument of 2
  178. results in a right context consisting of the portion of the target
  179. word's sentence to the right of the target, plus the two sentences
  180. to the right of that sentence. Defaults to False.
  181. @type sort: integer
  182. @param sort: Should be set to one the provided SORT constants. If
  183. SORT_WORD, the output is sorted on the target word. If SORT_POS, the
  184. output is sorted on the target word's POS tag. If SORT_NUM, the
  185. output is sorted by sentence number. If SORT_RIGHT_CONTEXT, the
  186. output is sorted on the first word of the right context. Defaults to
  187. SORT_WORD.
  188. @type verbose: boolean
  189. @param verbose: Displays some extra status information. Defaults
  190. to False.
  191. @rtype: list
  192. @return: Raw concordance ouput. Returned as a list of
  193. ([left context], target word, [right context], target word
  194. sentence number) tuples.
  195. """
  196. # compile the middle regexp.
  197. reg = re.compile(middleRegexp)
  198. if verbose:
  199. print "Matching the following target words:"
  200. wordLocs = []
  201. # get list of (sentence, word) pairs to get context for
  202. for item in self.index.getIndex().iteritems():
  203. if reg.match("/".join([item[0][0].lower(), item[0][1]])):
  204. if verbose:
  205. print "/".join(item[0])
  206. wordLocs.append(item[1])
  207. print ""
  208. items = []
  209. # if context lengths are specified in words:
  210. if contextInSentences == False:
  211. # for each list of (sentence, word offset in sentence) pairs:
  212. for wordList in wordLocs:
  213. # for each (sentence, word offset in sentence) pair:
  214. for sentenceNum, offset in wordList:
  215. # set pointers to the left- and rightmost sentences to be
  216. # looked at to the sentence the target word is in
  217. leftCorpusIndex = sentenceNum
  218. rightCorpusIndex = sentenceNum
  219. # number of words to include in the left context is
  220. # initially everything in the sentence up to the target
  221. leftLength = offset
  222. # number of words to include in the left context is
  223. # initially everything in the sentence after the target
  224. rightLength = self.index.getSentenceLengths()[sentenceNum] - offset - 1
  225. # while the length of the left context is less than what we
  226. # need, keep decreasing the left corpus index (ie adding
  227. # sentences to the left context).
  228. while leftLength < leftContextLength:
  229. leftCorpusIndex -= 1
  230. # if the new corpus index would fall off the end of the
  231. # list, stop at 0
  232. if(leftCorpusIndex < 0):
  233. leftCorpusIndex = 0
  234. break
  235. # adjust length and offset
  236. leftLength += self.index.getSentenceLengths()[leftCorpusIndex]
  237. offset += self.index.getSentenceLengths()[leftCorpusIndex]
  238. # while the length of the right context is less than what we
  239. # need, keep increasing the right corpus index (ie adding
  240. # sentences to the right context).
  241. while rightLength < rightContextLength:
  242. rightCorpusIndex += 1
  243. try:
  244. rightLength += self.index.getSentenceLengths()[rightCorpusIndex]
  245. # if the new corpus index falls off the end of the list,
  246. # stop at the end
  247. except IndexError:
  248. rightCorpusIndex -= 1
  249. break
  250. # grab all sentences from the left to right corpus indices,
  251. # then flatten them into a single list of words
  252. sents = self.sentences[leftCorpusIndex:rightCorpusIndex+1]
  253. words = []
  254. for sentence in sents:
  255. for word in sentence:
  256. words.append(word)
  257. # select the appropriate sections of context from the list
  258. # of words
  259. left = words[offset-leftContextLength:offset]
  260. target = words[offset]
  261. right = words[offset+1:offset+1+rightContextLength]
  262. items.append((left, target, right, sentenceNum))
  263. # if context lengths are specified in sentences:
  264. else:
  265. # for each list of (sentence, word offset in sentence) pairs:
  266. for wordList in wordLocs:
  267. # for each list of (sentence, word offset in sentence) pairs:
  268. for sentenceNum, offset in wordList:
  269. # set pointers to the left- and rightmost sentences to be
  270. # looked at to the sentence the target word is in
  271. leftCorpusIndex = sentenceNum
  272. rightCorpusIndex = sentenceNum
  273. # number of words to include in the left context is
  274. # initially everything in the sentence up to the target
  275. leftLength = offset
  276. # number of words to include in the left context is
  277. # initially everything in the sentence after the target
  278. rightLength = self.index.getSentenceLengths()[sentenceNum] - offset - 1
  279. # keep track of the number of sentences included in the
  280. # left/right context
  281. leftSents = 0;
  282. rightSents = 0;
  283. # while we don't have enough sentences in the left context,
  284. # keep decreasing the left corpus index
  285. while leftSents < leftContextLength:
  286. leftCorpusIndex -= 1
  287. # if the new corpus index would fall off the end of the
  288. # list, stop at 0
  289. if(leftCorpusIndex < 0):
  290. leftCorpusIndex = 0
  291. break
  292. leftLength += self.index.getSentenceLengths()[leftCorpusIndex]
  293. offset += self.index.getSentenceLengths()[leftCorpusIndex]
  294. leftSents += 1
  295. # while we don't have enough sentences in the right context,
  296. # keep increasing the right corpus index
  297. while rightSents < rightContextLength:
  298. rightCorpusIndex += 1
  299. try:
  300. rightLength += self.index.getSentenceLengths()[rightCorpusIndex]
  301. rightSents += 1
  302. # if the new corpus index falls off the end of the list,
  303. # stop at the end
  304. except IndexError:
  305. rightCorpusIndex -= 1
  306. break
  307. # grab all sentences from the left to right corpus indices,
  308. # then flatten them into a single list of words
  309. sents = self.sentences[leftCorpusIndex:rightCorpusIndex+1]
  310. words = []
  311. for sentence in sents:
  312. for word in sentence:
  313. words.append(word)
  314. # select the appropriate sections of context from the list
  315. # of words
  316. left = words[0:offset]
  317. target = words[offset]
  318. right = words[offset+1:]
  319. items.append((left, target, right, sentenceNum))
  320. if verbose:
  321. print "Found %d matches for target word..." % len(items)
  322. # sort the concordance
  323. if sort == self.SORT_WORD:
  324. if verbose:
  325. print "Sorting by target word..."
  326. items.sort(key=lambda i:i[1][0].lower())
  327. elif sort == self.SORT_POS:
  328. if verbose:
  329. print "Sorting by target word POS tag..."
  330. items.sort(key=lambda i:i[1][1].lower())
  331. elif sort == self.SORT_NUM:
  332. if verbose:
  333. print "Sorting by sentence number..."
  334. items.sort(key=lambda i:i[3])
  335. elif sort == self.SORT_RIGHT_CONTEXT:
  336. if verbose:
  337. print "Sorting by first word of right context..."
  338. items.sort(key=lambda i:i[2][0][0])
  339. # if any regular expressions have been given for the context, filter
  340. # the concordance using them
  341. filtered = []
  342. filterBool = False
  343. if leftRegexp != None or rightRegexp != None:
  344. filterBool = True
  345. if filterBool:
  346. leftRe=None
  347. rightRe=None
  348. if leftRegexp != None:
  349. if verbose:
  350. print "Filtering on left context..."
  351. leftRe = re.compile(leftRegexp)
  352. if rightRegexp != None:
  353. if verbose:
  354. print "Filtering on right context..."
  355. rightRe = re.compile(rightRegexp)
  356. for item in items:
  357. if self._matches(item, leftRe, rightRe):
  358. filtered.append(item)
  359. if filterBool:
  360. source = filtered
  361. else:
  362. source = items
  363. return source
  364. def format(self, source, contextChars=55, maxKeyLength=0, showWord=True,
  365. showPOS=True, flipWordAndPOS=False, verbose=False):
  366. """Formats raw concordance output produced by raw().
  367. Displays a concordance in keyword-in-context style format.
  368. @type source: list
  369. @param source: Raw concordance output to format. Expects a list of
  370. ([left context], target word, [right context], target
  371. word sentence number) tuples.
  372. @type contextChars number
  373. @param contextChars: Amount of context to show. If set to less than
  374. 0, does not limit amount of context shown (may look ugly). Defaults to 55.
  375. @type maxKeyLength: number
  376. @param maxKeyLength: Max number of characters to show for the
  377. target word. If 0 or less, this value is
  378. calculated so as to fully show all target
  379. words. Defaults to 0.
  380. @type showWord: boolean
  381. @param showWord: Whether to show words. Defaults to True.
  382. @type showPOS: boolean
  383. @param showPOS: Whether to show POS tags. Defaults to True.
  384. @type flipWordAndPOS: boolean
  385. @param flipWordAndPOS: If true, displays POS tags first instead of
  386. words (ie prints 'cc/and' instead of 'and/cc'). Defaults to False.
  387. @type verbose: boolean
  388. @param verbose: Displays some extra status information. Defaults
  389. to False.
  390. """
  391. # flatten lists of tokens into strings
  392. lines = []
  393. maxMiddleLength = -1
  394. # generate intermediate list of string tuples
  395. for line in source:
  396. # flatten left context tokens into a single string, joining words
  397. # and their POS tag with a '/' (if both are shown).
  398. left = ""
  399. for item in line[0]:
  400. if item[0] == "" and item[1] == "":
  401. left = ""
  402. elif showWord and (not showPOS):
  403. left += item[0] + " "
  404. elif (not showWord) and showPOS:
  405. left += item[1] + " "
  406. elif flipWordAndPOS:
  407. left += item[1] + "/" + item[0] + " "
  408. else:
  409. left += "/".join(item) + " "
  410. # flatten target word into a single string, joining the word and
  411. # its POS tag with a '/' (if both are shown).
  412. if showWord and (not showPOS):
  413. middle = line[1][0]
  414. elif (not showWord) and showPOS:
  415. middle = line[1][1]
  416. elif flipWordAndPOS:
  417. middle = line[1][1] + "/" + line[1][0] + " "
  418. else:
  419. middle = "/".join(line[1])
  420. if len(middle) > maxMiddleLength:
  421. maxMiddleLength = len(middle)
  422. # flatten right context tokens into a single string, joining words
  423. # and their POS tag with a '/' (if both are shown).
  424. right = ""
  425. for item in line[2]:
  426. if item[0] == "" and item[1] == "":
  427. right = ""
  428. elif showWord and (not showPOS):
  429. right += item[0] + " "
  430. elif (not showWord) and showPOS:
  431. right += item[1] + " "
  432. elif flipWordAndPOS:
  433. right += item[1] + "/" + item[0] + " "
  434. else:
  435. right += "/".join(item) + " "
  436. num = line[3]
  437. lines.append((middle, left, right, num))
  438. # crop and justify strings to generate KWIC-format output
  439. count = 0
  440. for middle, left, right, num in lines:
  441. # calculate amount of left padding needed
  442. leftPaddingLength = contextChars - len(left)
  443. if leftPaddingLength < 0:
  444. leftPaddingLength = 0
  445. if len(left) > contextChars and contextChars > -1:
  446. left = left[-contextChars:]
  447. left = " "*leftPaddingLength + left
  448. if contextChars > -1:
  449. right = right[0:contextChars]
  450. # add sentence numbers
  451. left = str(num) + ": " + left[len(str(num))+2 : ]
  452. # calculate amount of middle padding needed
  453. if maxKeyLength > 0:
  454. maxMiddleLength = maxKeyLength
  455. lPad = int(ceil(max(maxMiddleLength - len(middle), 0) / 2.0))
  456. rPad = int(floor(max(maxMiddleLength - len(middle), 0) / 2.0))
  457. middle = " "*lPad + middle + " "*rPad
  458. print left + "| " + middle + " | " + right + " "
  459. count += 1
  460. if verbose:
  461. print "\n" + repr(count) + " lines"
  462. def _matches(self, item, leftRe, rightRe):
  463. """ Private method that runs the given regexps over a raw concordance
  464. item and returns whether they match it.
  465. """
  466. left = item[0]
  467. right = item[2]
  468. # flatten left and right contexts
  469. leftString = ""
  470. for token in left:
  471. leftString += "/".join(token) + " "
  472. rightString = ""
  473. for token in right:
  474. rightString += "/".join(token) + " "
  475. # see if regexps match
  476. ok = True
  477. if leftRe != None and leftRe.match(leftString) == None:
  478. ok = False
  479. if rightRe != None and rightRe.match(rightString) == None:
  480. ok = False
  481. if ok:
  482. return True
  483. else:
  484. return False
  485. class Aggregator(object):
  486. """ Class for aggregating and summarising corpus concordance data.
  487. This class allows one or more sets of concordance data to be summarised and
  488. displayed. This is useful for corpus linguistic tasks like counting the
  489. number of occurences of a particular word and its different POS tags in a
  490. given corpus, or comparing these frequencies across different corpora. It
  491. creates a FreqDist for each set of concordance data, counting how often each
  492. unique entry appears in it.
  493. An example of how to use this class to show the frequency of the five most
  494. common digrams of the form "must/md X/Y" in the Brown Corpus sections a
  495. and g::
  496. concA = IndexConcordance(brown.tagged_sents('a'))
  497. rawA = concA.raw(middleRegexp="^must/md$", leftContextLength=0, rightContextLength=1)
  498. concG = IndexConcordance(brown.tagged_sents('g'))
  499. rawG = concG.raw(middleRegexp="^must/md$", leftContextLength=0, rightContextLength=1)
  500. agg = Aggregator()
  501. agg.add(rawA, "Brown Corpus A")
  502. agg.add(rawG, "Brown Corpus G")
  503. agg.formatted(showFirstX=5)
  504. Output:
  505. Brown Corpus A
  506. ------------------------------
  507. must/md be/be 17
  508. must/md have/hv 5
  509. must/md not/* 3
  510. must/md play/vb 2
  511. must/md ''/'' 1
  512. Brown Corpus G
  513. ------------------------------
  514. must/md be/be 38
  515. must/md have/hv 21
  516. must/md ,/, 6
  517. must/md not/* 5
  518. must/md always/rb 3
  519. """
  520. # text for 'other' row in output tables
  521. _OTHER_TEXT = "<OTHER>"
  522. # text for 'total' row in output tables
  523. _TOTAL_TEXT = "<TOTAL>"
  524. def __init__(self, inputList=None):
  525. """ Constructor.
  526. @type inputList: list
  527. @param inputList: List of (raw concordance data, name) tuples to be
  528. entered into the aggregator. Defaults to None.
  529. """
  530. self._outputSets = []
  531. if inputList != None:
  532. for (item, n) in inputList:
  533. self.add(item, name=n)
  534. def add(self, raw, name):
  535. """ Adds the given set of raw concordance output to the aggregator.
  536. @type raw: list
  537. @param raw: Raw concordance data (produced by IndexConcordance.raw()).
  538. Expects a list of ([left context], target word,
  539. [right context], target word sentence number) tuples.
  540. @type name: string
  541. @param name: Name to associate with the set of data.
  542. """
  543. self._outputSets.append((raw, name));
  544. def remove(self, name):
  545. """ Removes all sets of raw concordance output with the given name.
  546. @type name: string
  547. @param name: Name of data set to remove.
  548. """
  549. for item in self._outputSets:
  550. if item[1] == name:
  551. self._outputSets.remove(item)
  552. def formatted(self, useWord=True, usePOS=True, normalise=False,
  553. threshold=-1, showFirstX=-1, decimalPlaces=4,
  554. countOther=False, showTotal=False):
  555. """ Displays formatted concordance summary information.
  556. This is a convenience method that combines raw() and display()'s
  557. options. Unless you need raw output, this is probably the most useful
  558. method.
  559. @type useWord: boolean
  560. @param useWord: Include the words in the count. Defaults to True.
  561. @type usePOS: boolean
  562. @param usePOS: Include the POS tags in the count. Defaults to
  563. False.
  564. @type normalise: boolean
  565. @param normalise: If true, normalises the frequencies for each set
  566. of concordance output by dividing each key's frequency by the total
  567. number of samples in that concordances's FreqDist. Allows easier
  568. comparison of results between data sets. Care must be taken when
  569. combining this option with the threshold option, as any threshold
  570. of 1 or more will prevent any output being displayed. Defaults to
  571. False.
  572. @type threshold: number
  573. @param threshold: Frequency display threshold. Results below this
  574. frequency will not be displayed. If less than 0, everything will be
  575. displayed. Defaults to -1.
  576. @type showFirstX: number
  577. @param showFirstX: Only show this many results, starting with the
  578. most frequent. If less than 0, everything will be displayed.
  579. Defaults to -1.
  580. @type decimalPlaces: integer
  581. @param decimalPlaces: Number of decimal places of accuracy to
  582. display. Used when displaying non-integers with the normalise
  583. option. Defaults to 4.
  584. @type countOther: boolean
  585. @param countOther: If true, any samples not shown (due to their
  586. frequency being below the given thershold or because they were
  587. after the number of results specified by the showFirstX argument)
  588. will be combined into one sample. This sample's frequency is the
  589. sum of all unshown sample's frequencies. Defaults to False.
  590. @type showTotal: boolean
  591. @param showTotal: If true, prints the sum of all frequencies (of
  592. the entire FreqDist, not just of the samples displayed.) Defaults
  593. to False.
  594. """
  595. output, maxKeyLength = self.raw(useWord, usePOS)
  596. self.format(output, maxKeyLength, threshold, showFirstX,
  597. decimalPlaces, normalise, countOther, showTotal)
  598. def raw(self, useWord=True, usePOS=True):
  599. """ Generates raw summary information.
  600. Creates a FreqDist for each set of concordance output and uses it to
  601. count the frequency of each line in it. The concordance output is
  602. flattened from lists of tokens to strings, as lists cannot be hashed.
  603. The list of FreqDists is returned, as well as the length of the longest
  604. string (used for formatted display).
  605. @type useWord: boolean
  606. @param useWord: Include the words in the count. Defaults to True.
  607. @type usePOS: boolean
  608. @param usePOS: Include the POS tags in the count. Defaults to
  609. False.
  610. @rtype: list, number
  611. @returns: A list of (FreqDist, name) pairs, and the length of the
  612. longest key in all the FreqDists.
  613. """
  614. output = []
  615. maxKeyLength = 0
  616. # for each set of raw concordance data:
  617. for (rawConcOutput, name) in self._outputSets:
  618. # initialise a FreqDist
  619. dist = FreqDist()
  620. # for each item in the raw concordance output:
  621. for (left, middle, right, num) in rawConcOutput:
  622. # flatten the lists of tokens so they can be hashed in
  623. # the FreqDist
  624. leftList = []
  625. for word in left:
  626. if usePOS == False and useWord == True:
  627. leftList.append(word[0].lower())
  628. elif usePOS == True and useWord == False:
  629. leftList.append(word[1].lower())
  630. else:
  631. leftList.append(word[0].lower() + "/" + word[1].lower())
  632. try:
  633. if usePOS == False and useWord == True:
  634. midString = middle[0].lower()
  635. elif usePOS == True and useWord == False:
  636. midString = middle[1].lower()
  637. else:
  638. midString = middle[0].lower() + "/" + middle[1].lower()
  639. except IndexError:
  640. midString = ""
  641. rightList = []
  642. for word in right:
  643. if usePOS == False and useWord == True:
  644. rightList.append(word[0].lower())
  645. elif usePOS == True and useWord == False:
  646. rightList.append(word[1].lower())
  647. else:
  648. rightList.append(word[0].lower() + "/" + word[1].lower())
  649. # join the tokens together to form a key string
  650. key = string.join(leftList) + " " + midString + " " + string.join(rightList)
  651. # keep track of the longest key length
  652. if len(key) > maxKeyLength:
  653. maxKeyLength = len(key)
  654. # increment the FreqDist's count for this key
  655. dist.inc(key)
  656. # add this FreqDist and name to the output
  657. output.append((dist, name))
  658. # return the output and maximum key length
  659. return output, maxKeyLength
  660. def format(self, output, maxKeyLength=20, threshold=-1, showFirstX=-1,
  661. decimalPlaces=4, normalise=False, countOther=False,
  662. showTotal=False):
  663. """ Displays concordance summary information.
  664. Formats and displays information produced by raw().
  665. @type output: list
  666. @param output: List of (FreqDist, name) pairs (as produced by raw()).
  667. @type maxKeyLength: number
  668. @param maxKeyLength: Length of longest key. Defaults to 20.
  669. @type normalise: boolean
  670. @param normalise: If true, normalises the frequencies for each set
  671. of concordance output by dividing each key's frequency by the total
  672. number of samples in that concordances's FreqDist. Allows easier
  673. comparison of results between data sets. Care must be taken when
  674. combining this option with the threshold option, as any threshold
  675. of 1 or more will prevent any output being displayed. Defaults to
  676. False.
  677. @type threshold: number
  678. @param threshold: Frequency display threshold. Results below this
  679. frequency will not be displayed. If less than 0, everything will be
  680. displayed. Defaults to -1.
  681. @type showFirstX: number
  682. @param showFirstX: Only show this many results, starting with the
  683. most frequent. If less than 0, everything will be displayed.
  684. Defaults to -1.
  685. @type decimalPlaces: integer
  686. @param decimalPlaces: Number of decimal places of accuracy to
  687. display. Used when displaying non-integers with the normalise
  688. option. Defaults to 4.
  689. @type countOther: boolean
  690. @param countOther: If true, any samples not shown (due to their
  691. frequency being below the given thershold or because they were
  692. after the number of results specified by the showFirstX argument)
  693. will be combined into one sample. This sample's frequency is the
  694. sum of all unshown sample's frequencies. Defaults to False.
  695. @type showTotal: boolean
  696. @param showTotal: If true, prints the sum of all frequencies (of
  697. the entire FreqDist, not just of the samples displayed.) Defaults
  698. to False.
  699. """
  700. # for each FreqDist:
  701. for (dist, name) in output:
  702. x = 0
  703. other = 0
  704. total = 0
  705. print name
  706. print "-"*(maxKeyLength + 7)
  707. # for each key:
  708. for key in dist.keys():
  709. # keep track of how many samples shown, if using the showFirstX
  710. # option
  711. #if showFirstX > 0 and x >= showFirstX:
  712. # break
  713. # get and format the sample's frequency
  714. if normalise:
  715. count = 1.0 * dist[key] / dist.N()
  716. countString = str(count)[0:decimalPlaces + 2]
  717. else:
  718. count = dist[key]
  719. countString = str(count)
  720. total += count
  721. # if the count is less than the threshold value, or we've
  722. # already shown X samples, add this sample's frequency to the
  723. # 'other' bin
  724. if count < threshold or (showFirstX > 0 and x >= showFirstX):
  725. other += count
  726. else:
  727. print key + " "*(maxKeyLength - len(key) + 1) + countString
  728. x += 1
  729. if countOther:
  730. if normalise:
  731. count = 1.0 * other
  732. countString = str(count)[0:decimalPlaces + 2]
  733. else:
  734. count = other
  735. countString = str(count)
  736. print self._OTHER_TEXT + " "*(maxKeyLength - len(self._OTHER_TEXT) + 1) + countString
  737. if showTotal:
  738. if normalise:
  739. count = 1.0 * total
  740. countString = str(count)[0:decimalPlaces + 2]
  741. else:
  742. count = total
  743. countString = str(count)
  744. print self._TOTAL_TEXT + " "*(maxKeyLength - len(self._TOTAL_TEXT) + 1) + countString
  745. print ""
  746. def demo():
  747. """
  748. Demonstrates how to use IndexConcordance and Aggregator.
  749. """
  750. print "Reading Brown Corpus into memory..."
  751. corpus = brown.tagged_sents('a')
  752. print "Generating index..."
  753. ic = IndexConcordance(corpus)
  754. print "Showing all occurences of 'plasma' in the Brown Corpus..."
  755. ic.formatted(middleRegexp="^plasma/.*", verbose=True)
  756. print "Investigating the collocates of 'deal' and derivatives..."
  757. agg = Aggregator()
  758. agg.add(ic.raw(middleRegexp="^deal", leftContextLength=1, rightContextLength=0,
  759. leftRegexp="^(\w|\s|/)*$"), "Brown Corpus 'deal' left collocates")
  760. agg.add(ic.raw(middleRegexp="^deal", leftContextLength=0, rightContextLength=1,
  761. rightRegexp="^(\w|\s|/)*$"), "Brown Corpus 'deal' right collocates")
  762. agg.formatted(showFirstX=5, usePOS=False)
  763. if __name__ == '__main__':
  764. demo()