PageRenderTime 56ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/test/test_nl.py

https://gitlab.com/github-cloud-corporation/pattern
Python | 273 lines | 270 code | 2 blank | 1 comment | 1 complexity | 19a5146666a867094d7b8d7fd4e1685b MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
  3. import unittest
  4. import subprocess
  5. from pattern import nl
  6. try:
  7. PATH = os.path.dirname(os.path.realpath(__file__))
  8. except:
  9. PATH = ""
  10. #---------------------------------------------------------------------------------------------------
  11. class TestInflection(unittest.TestCase):
  12. def setUp(self):
  13. pass
  14. def test_pluralize(self):
  15. # Assert "auto's" as plural of "auto".
  16. self.assertEqual("auto's", nl.inflect.pluralize("auto"))
  17. # Assert the accuracy of the pluralization algorithm.
  18. from pattern.db import Datasheet
  19. i, n = 0, 0
  20. for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
  21. if nl.pluralize(sg) == pl:
  22. i +=1
  23. n += 1
  24. self.assertTrue(float(i) / n > 0.74)
  25. print("pattern.nl.pluralize()")
  26. def test_singularize(self):
  27. # Assert the accuracy of the singularization algorithm.
  28. from pattern.db import Datasheet
  29. i, n = 0, 0
  30. for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
  31. if nl.singularize(pl) == sg:
  32. i +=1
  33. n += 1
  34. self.assertTrue(float(i) / n > 0.88)
  35. print("pattern.nl.singularize()")
  36. def test_attributive(self):
  37. # Assert the accuracy of the attributive algorithm ("fel" => "felle").
  38. from pattern.db import Datasheet
  39. i, n = 0, 0
  40. for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
  41. if nl.attributive(pred) == attr:
  42. i +=1
  43. n += 1
  44. self.assertTrue(float(i) / n > 0.96)
  45. print("pattern.nl.attributive()")
  46. def test_predicative(self):
  47. # Assert the accuracy of the predicative algorithm ("felle" => "fel").
  48. from pattern.db import Datasheet
  49. i, n = 0, 0
  50. for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
  51. if nl.predicative(attr) == pred:
  52. i +=1
  53. n += 1
  54. self.assertTrue(float(i) / n > 0.96)
  55. print("pattern.nl.predicative()")
  56. def test_find_lemma(self):
  57. # Assert the accuracy of the verb lemmatization algorithm.
  58. # Note: the accuracy is higher (90%) when measured on CELEX word forms
  59. # (presumably because nl.inflect.verbs has high percentage irregular verbs).
  60. i, n = 0, 0
  61. for v1, v2 in nl.inflect.verbs.inflections.items():
  62. if nl.inflect.verbs.find_lemma(v1) == v2:
  63. i += 1
  64. n += 1
  65. self.assertTrue(float(i) / n > 0.83)
  66. print("pattern.nl.inflect.verbs.find_lemma()")
  67. def test_find_lexeme(self):
  68. # Assert the accuracy of the verb conjugation algorithm.
  69. i, n = 0, 0
  70. for v, lexeme1 in nl.inflect.verbs.infinitives.items():
  71. lexeme2 = nl.inflect.verbs.find_lexeme(v)
  72. for j in range(len(lexeme2)):
  73. if lexeme1[j] == lexeme2[j] or \
  74. lexeme1[j] == "" and \
  75. lexeme1[j > 5 and 10 or 0] == lexeme2[j]:
  76. i += 1
  77. n += 1
  78. self.assertTrue(float(i) / n > 0.79)
  79. print("pattern.nl.inflect.verbs.find_lexeme()")
  80. def test_conjugate(self):
  81. # Assert different tenses with different conjugations.
  82. for (v1, v2, tense) in (
  83. ("zijn", "zijn", nl.INFINITIVE),
  84. ("zijn", "ben", (nl.PRESENT, 1, nl.SINGULAR)),
  85. ("zijn", "bent", (nl.PRESENT, 2, nl.SINGULAR)),
  86. ("zijn", "is", (nl.PRESENT, 3, nl.SINGULAR)),
  87. ("zijn", "zijn", (nl.PRESENT, 0, nl.PLURAL)),
  88. ("zijn", "zijnd", (nl.PRESENT + nl.PARTICIPLE,)),
  89. ("zijn", "was", (nl.PAST, 1, nl.SINGULAR)),
  90. ("zijn", "was", (nl.PAST, 2, nl.SINGULAR)),
  91. ("zijn", "was", (nl.PAST, 3, nl.SINGULAR)),
  92. ("zijn", "waren", (nl.PAST, 0, nl.PLURAL)),
  93. ("zijn", "was", (nl.PAST, 0, None)),
  94. ("zijn", "geweest", (nl.PAST + nl.PARTICIPLE,)),
  95. ("had", "hebben", "inf"),
  96. ("had", "heb", "1sg"),
  97. ("had", "hebt", "2sg"),
  98. ("had", "heeft", "3sg"),
  99. ("had", "hebben", "pl"),
  100. ("had", "hebbend", "part"),
  101. ("heeft", "had", "1sgp"),
  102. ("heeft", "had", "2sgp"),
  103. ("heeft", "had", "3sgp"),
  104. ("heeft", "hadden", "ppl"),
  105. ("heeft", "had", "p"),
  106. ("heeft", "gehad", "ppart"),
  107. ("smsen", "smste", "3sgp")):
  108. self.assertEqual(nl.conjugate(v1, tense), v2)
  109. print("pattern.nl.conjugate()")
  110. def test_lexeme(self):
  111. # Assert all inflections of "zijn".
  112. v = nl.lexeme("zijn")
  113. self.assertEqual(v, [
  114. "zijn", "ben", "bent", "is", "zijnd", "waren", "was", "geweest"
  115. ])
  116. print("pattern.nl.inflect.lexeme()")
  117. def test_tenses(self):
  118. # Assert tense recognition.
  119. self.assertTrue((nl.PRESENT, 3, "sg") in nl.tenses("is"))
  120. self.assertTrue("3sg" in nl.tenses("is"))
  121. print("pattern.nl.tenses()")
  122. #---------------------------------------------------------------------------------------------------
  123. class TestParser(unittest.TestCase):
  124. def setUp(self):
  125. pass
  126. def test_wotan2penntreebank(self):
  127. # Assert tag translation.
  128. for penntreebank, wotan in (
  129. ("NNP", "N(eigen,ev,neut)"),
  130. ("NNPS", "N(eigen,mv,neut)"),
  131. ("NN", "N(soort,ev,neut)"),
  132. ("NNS", "N(soort,mv,neut)"),
  133. ("VBZ", "V(refl,ott,3,ev)"),
  134. ("VBP", "V(intrans,ott,1_of_2_of_3,mv)"),
  135. ("VBD", "V(trans,ovt,1_of_2_of_3,mv)"),
  136. ("VBN", "V(trans,verl_dw,onverv)"),
  137. ("VBG", "V(intrans,teg_dw,onverv)"),
  138. ("VB", "V(intrans,inf)"),
  139. ("MD", "V(hulp_of_kopp,ott,3,ev)"),
  140. ("JJ", "Adj(attr,stell,onverv)"),
  141. ("JJR", "Adj(adv,vergr,onverv)"),
  142. ("JJS", "Adj(attr,overtr,verv_neut)"),
  143. ("RP", "Adv(deel_v)"),
  144. ("RB", "Adv(gew,geen_func,stell,onverv)"),
  145. ("DT", "Art(bep,zijd_of_mv,neut)"),
  146. ("CC", "Conj(neven)"),
  147. ("CD", "Num(hoofd,bep,zelfst,onverv)"),
  148. ("TO", "Prep(voor_inf)"),
  149. ("IN", "Prep(voor)"),
  150. ("PRP", "Pron(onbep,neut,attr)"),
  151. ("PRP$", "Pron(bez,2,ev,neut,attr)"),
  152. (",", "Punc(komma)"),
  153. ("(", "Punc(haak_open)"),
  154. (")", "Punc(haak_sluit)"),
  155. (".", "Punc(punt)"),
  156. ("UH", "Int"),
  157. ("SYM", "Misc(symbool)")):
  158. self.assertEqual(nl.wotan2penntreebank("", wotan)[1], penntreebank)
  159. print("pattern.nl.wotan2penntreebank()")
  160. def test_find_lemmata(self):
  161. # Assert lemmata for nouns and verbs.
  162. v = nl.parser.find_lemmata([["katten", "NNS"], ["droegen", "VBD"], ["hoeden", "NNS"]])
  163. self.assertEqual(v, [
  164. ["katten", "NNS", "kat"],
  165. ["droegen", "VBD", "dragen"],
  166. ["hoeden", "NNS", "hoed"]])
  167. print("pattern.nl.parser.find_lemmata()")
  168. def test_parse(self):
  169. # Assert parsed output with Penn Treebank II tags (slash-formatted).
  170. # 1) "de zwarte kat" is a noun phrase, "op de mat" is a prepositional noun phrase.
  171. v = nl.parser.parse("De zwarte kat zat op de mat.")
  172. self.assertEqual(v,
  173. "De/DT/B-NP/O zwarte/JJ/I-NP/O kat/NN/I-NP/O " + \
  174. "zat/VBD/B-VP/O " + \
  175. "op/IN/B-PP/B-PNP de/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O"
  176. )
  177. # 2) "jaagt" and "vogels" lemmata are "jagen" and "vogel".
  178. v = nl.parser.parse("De zwarte kat jaagt op vogels.", lemmata=True)
  179. self.assertEqual(v,
  180. "De/DT/B-NP/O/de zwarte/JJ/I-NP/O/zwart kat/NN/I-NP/O/kat " + \
  181. "jaagt/VBZ/B-VP/O/jagen " + \
  182. "op/IN/B-PP/B-PNP/op vogels/NNS/B-NP/I-PNP/vogel ././O/O/."
  183. )
  184. # Assert the accuracy of the Dutch tagger.
  185. i, n = 0, 0
  186. for sentence in open(os.path.join(PATH, "corpora", "tagged-nl-twnc.txt")).readlines():
  187. sentence = sentence.decode("utf-8").strip()
  188. s1 = [w.split("/") for w in sentence.split(" ")]
  189. s1 = [nl.wotan2penntreebank(w, tag) for w, tag in s1]
  190. s2 = [[w for w, pos in s1]]
  191. s2 = nl.parse(s2, tokenize=False)
  192. s2 = [w.split("/") for w in s2.split(" ")]
  193. for j in range(len(s1)):
  194. if s1[j][1] == s2[j][1]:
  195. i += 1
  196. n += 1
  197. self.assertTrue(float(i) / n > 0.90)
  198. print("pattern.nl.parser.parse()")
  199. def test_tag(self):
  200. # Assert [("zwarte", "JJ"), ("panters", "NNS")].
  201. v = nl.tag("zwarte panters")
  202. self.assertEqual(v, [("zwarte", "JJ"), ("panters", "NNS")])
  203. print("pattern.nl.tag()")
  204. def test_command_line(self):
  205. # Assert parsed output from the command-line (example from the documentation).
  206. p = ["python", "-m", "pattern.nl", "-s", "Leuke kat.", "-OTCRL"]
  207. p = subprocess.Popen(p, stdout=subprocess.PIPE)
  208. p.wait()
  209. v = p.stdout.read()
  210. v = v.strip()
  211. self.assertEqual(v, "Leuke/JJ/B-NP/O/O/leuk kat/NN/I-NP/O/O/kat ././O/O/O/.")
  212. print("python -m pattern.nl")
  213. #---------------------------------------------------------------------------------------------------
  214. class TestSentiment(unittest.TestCase):
  215. def setUp(self):
  216. pass
  217. def test_sentiment(self):
  218. # Assert < 0 for negative adjectives and > 0 for positive adjectives.
  219. self.assertTrue(nl.sentiment("geweldig")[0] > 0)
  220. self.assertTrue(nl.sentiment("verschrikkelijk")[0] < 0)
  221. # Assert the accuracy of the sentiment analysis.
  222. # Given are the scores for 3,000 book reviews.
  223. # The baseline should increase (not decrease) when the algorithm is modified.
  224. from pattern.db import Datasheet
  225. from pattern.metrics import test
  226. reviews = []
  227. for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")):
  228. reviews.append((review, int(score) > 0))
  229. A, P, R, F = test(lambda review: nl.positive(review), reviews)
  230. #print(A, P, R, F)
  231. self.assertTrue(A > 0.808)
  232. self.assertTrue(P > 0.780)
  233. self.assertTrue(R > 0.860)
  234. self.assertTrue(F > 0.818)
  235. print("pattern.nl.sentiment()")
  236. #---------------------------------------------------------------------------------------------------
  237. def suite():
  238. suite = unittest.TestSuite()
  239. suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection))
  240. suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser))
  241. suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestSentiment))
  242. return suite
  243. if __name__ == "__main__":
  244. unittest.TextTestRunner(verbosity=1).run(suite())