PageRenderTime 44ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/test/test_es.py

https://gitlab.com/admin-github-cloud/pattern
Python | 256 lines | 253 code | 2 blank | 1 comment | 1 complexity | cdb2580d325151309bea15a52bb1c8dd MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
  3. import unittest
  4. import subprocess
  5. from pattern import es
  6. try:
  7. PATH = os.path.dirname(os.path.realpath(__file__))
  8. except:
  9. PATH = ""
  10. #---------------------------------------------------------------------------------------------------
  11. class TestInflection(unittest.TestCase):
  12. def setUp(self):
  13. pass
  14. def test_pluralize(self):
  15. # Assert the accuracy of the pluralization algorithm.
  16. from pattern.db import Datasheet
  17. test = {}
  18. for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
  19. if tag == "n": test.setdefault(lemma, []).append(w)
  20. i, n = 0, 0
  21. for sg, pl in test.items():
  22. pl = sorted(pl, key=len, reverse=True)[0]
  23. if es.pluralize(sg) == pl:
  24. i += 1
  25. n += 1
  26. self.assertTrue(float(i) / n > 0.77)
  27. print("pattern.es.pluralize()")
  28. def test_singularize(self):
  29. # Assert the accuracy of the singularization algorithm.
  30. from pattern.db import Datasheet
  31. test = {}
  32. for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
  33. if tag == "n": test.setdefault(lemma, []).append(w)
  34. i, n = 0, 0
  35. for sg, pl in test.items():
  36. pl = sorted(pl, key=len, reverse=True)[0]
  37. if es.singularize(pl) == sg:
  38. i += 1
  39. n += 1
  40. self.assertTrue(float(i) / n > 0.93)
  41. print("pattern.es.singularize()")
  42. def test_attributive(self):
  43. # Assert "alto" => "altos" (masculine, plural), and others.
  44. for lemma, inflected, gender in (
  45. (u"alto", u"alto", es.MALE + es.SINGULAR),
  46. (u"alto", u"altos", es.MALE + es.PLURAL),
  47. (u"alto", u"alta", es.FEMALE + es.SINGULAR),
  48. (u"alto", u"altas", es.FEMALE + es.PLURAL),
  49. (u"verde", u"verdes", es.MALE + es.PLURAL),
  50. (u"verde", u"verdes", es.FEMALE + es.PLURAL)):
  51. v = es.attributive(lemma, gender)
  52. self.assertEqual(v, inflected)
  53. print("pattern.es.attributive()")
  54. def test_predicative(self):
  55. # Assert the accuracy of the predicative algorithm ("horribles" => "horrible").
  56. from pattern.db import Datasheet
  57. test = {}
  58. for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
  59. if tag == "j": test.setdefault(lemma, []).append(w)
  60. i, n = 0, 0
  61. for pred, attr in test.items():
  62. attr = sorted(attr, key=len, reverse=True)[0]
  63. if es.predicative(attr) == pred:
  64. i += 1
  65. n += 1
  66. self.assertTrue(float(i) / n > 0.92)
  67. print("pattern.es.predicative()")
  68. def test_find_lemma(self):
  69. # Assert the accuracy of the verb lemmatization algorithm.
  70. i, n = 0, 0
  71. for v1, v2 in es.inflect.verbs.inflections.items():
  72. if es.inflect.verbs.find_lemma(v1) == v2:
  73. i += 1
  74. n += 1
  75. self.assertTrue(float(i) / n > 0.80)
  76. print("pattern.es.inflect.verbs.find_lemma()")
  77. def test_find_lexeme(self):
  78. # Assert the accuracy of the verb conjugation algorithm.
  79. i, n = 0, 0
  80. for v, lexeme1 in es.inflect.verbs.infinitives.items():
  81. lexeme2 = es.inflect.verbs.find_lexeme(v)
  82. for j in range(len(lexeme2)):
  83. if lexeme1[j] == lexeme2[j]:
  84. i += 1
  85. n += 1
  86. self.assertTrue(float(i) / n > 0.85)
  87. print("pattern.es.inflect.verbs.find_lexeme()")
  88. def test_conjugate(self):
  89. # Assert different tenses with different conjugations.
  90. for (v1, v2, tense) in (
  91. ("ser", u"ser", es.INFINITIVE),
  92. ("ser", u"soy", (es.PRESENT, 1, es.SINGULAR)),
  93. ("ser", u"eres", (es.PRESENT, 2, es.SINGULAR)),
  94. ("ser", u"es", (es.PRESENT, 3, es.SINGULAR)),
  95. ("ser", u"somos", (es.PRESENT, 1, es.PLURAL)),
  96. ("ser", u"sois", (es.PRESENT, 2, es.PLURAL)),
  97. ("ser", u"son", (es.PRESENT, 3, es.PLURAL)),
  98. ("ser", u"siendo", (es.PRESENT + es.PARTICIPLE)),
  99. ("ser", u"sido", (es.PAST + es.PARTICIPLE)),
  100. ("ser", u"era", (es.IMPERFECT, 1, es.SINGULAR)),
  101. ("ser", u"eras", (es.IMPERFECT, 2, es.SINGULAR)),
  102. ("ser", u"era", (es.IMPERFECT, 3, es.SINGULAR)),
  103. ("ser", u"éramos", (es.IMPERFECT, 1, es.PLURAL)),
  104. ("ser", u"erais", (es.IMPERFECT, 2, es.PLURAL)),
  105. ("ser", u"eran", (es.IMPERFECT, 3, es.PLURAL)),
  106. ("ser", u"fui", (es.PRETERITE, 1, es.SINGULAR)),
  107. ("ser", u"fuiste", (es.PRETERITE, 2, es.SINGULAR)),
  108. ("ser", u"fue", (es.PRETERITE, 3, es.SINGULAR)),
  109. ("ser", u"fuimos", (es.PRETERITE, 1, es.PLURAL)),
  110. ("ser", u"fuisteis", (es.PRETERITE, 2, es.PLURAL)),
  111. ("ser", u"fueron", (es.PRETERITE, 3, es.PLURAL)),
  112. ("ser", u"sería", (es.CONDITIONAL, 1, es.SINGULAR)),
  113. ("ser", u"serías", (es.CONDITIONAL, 2, es.SINGULAR)),
  114. ("ser", u"sería", (es.CONDITIONAL, 3, es.SINGULAR)),
  115. ("ser", u"seríamos", (es.CONDITIONAL, 1, es.PLURAL)),
  116. ("ser", u"seríais", (es.CONDITIONAL, 2, es.PLURAL)),
  117. ("ser", u"serían", (es.CONDITIONAL, 3, es.PLURAL)),
  118. ("ser", u"seré", (es.FUTURE, 1, es.SINGULAR)),
  119. ("ser", u"serás", (es.FUTURE, 2, es.SINGULAR)),
  120. ("ser", u"será", (es.FUTURE, 3, es.SINGULAR)),
  121. ("ser", u"seremos", (es.FUTURE, 1, es.PLURAL)),
  122. ("ser", u"seréis", (es.FUTURE, 2, es.PLURAL)),
  123. ("ser", u"serán", (es.FUTURE, 3, es.PLURAL)),
  124. ("ser", u"sé", (es.PRESENT, 2, es.SINGULAR, es.IMPERATIVE)),
  125. ("ser", u"sed", (es.PRESENT, 2, es.PLURAL, es.IMPERATIVE)),
  126. ("ser", u"sea", (es.PRESENT, 1, es.SINGULAR, es.SUBJUNCTIVE)),
  127. ("ser", u"seas", (es.PRESENT, 2, es.SINGULAR, es.SUBJUNCTIVE)),
  128. ("ser", u"sea", (es.PRESENT, 3, es.SINGULAR, es.SUBJUNCTIVE)),
  129. ("ser", u"seamos", (es.PRESENT, 1, es.PLURAL, es.SUBJUNCTIVE)),
  130. ("ser", u"seáis", (es.PRESENT, 2, es.PLURAL, es.SUBJUNCTIVE)),
  131. ("ser", u"sean", (es.PRESENT, 3, es.PLURAL, es.SUBJUNCTIVE)),
  132. ("ser", u"fuera", (es.PAST, 1, es.SINGULAR, es.SUBJUNCTIVE)),
  133. ("ser", u"fueras", (es.PAST, 2, es.SINGULAR, es.SUBJUNCTIVE)),
  134. ("ser", u"fuera", (es.PAST, 3, es.SINGULAR, es.SUBJUNCTIVE)),
  135. ("ser", u"fuéramos", (es.PAST, 1, es.PLURAL, es.SUBJUNCTIVE)),
  136. ("ser", u"fuerais", (es.PAST, 2, es.PLURAL, es.SUBJUNCTIVE)),
  137. ("ser", u"fueran", (es.PAST, 3, es.PLURAL, es.SUBJUNCTIVE))):
  138. self.assertEqual(es.conjugate(v1, tense), v2)
  139. print("pattern.es.conjugate()")
  140. def test_lexeme(self):
  141. # Assert all inflections of "ser".
  142. v = es.lexeme("ser")
  143. self.assertEqual(v, [
  144. u'ser', u'soy', u'eres', u'es', u'somos', u'sois', u'son', u'siendo',
  145. u'fui', u'fuiste', u'fue', u'fuimos', u'fuisteis', u'fueron', u'sido',
  146. u'era', u'eras', u'éramos', u'erais', u'eran',
  147. u'seré', u'serás', u'será', u'seremos', u'seréis', u'serán',
  148. u'sería', u'serías', u'seríamos', u'seríais', u'serían',
  149. u'sé', u'sed',
  150. u'sea', u'seas', u'seamos', u'seáis', u'sean',
  151. u'fuera', u'fueras', u'fuéramos', u'fuerais', u'fueran'
  152. ])
  153. print("pattern.es.inflect.lexeme()")
  154. def test_tenses(self):
  155. # Assert tense recognition.
  156. self.assertTrue((es.PRESENT, 3, es.SG) in es.tenses("es"))
  157. self.assertTrue("2sg" in es.tenses("eres"))
  158. # The CONDITIONAL is sometimes described as a mood,
  159. # and sometimes as a tense of the indicative mood (e.g., in Spanish):
  160. t1 = (es.CONDITIONAL, 1, es.SG)
  161. t2 = (es.PRESENT, 1, es.SG, es.CONDITIONAL)
  162. self.assertTrue("1sg->" in es.tenses(u"sería"))
  163. self.assertTrue(t1 in es.tenses(u"sería"))
  164. self.assertTrue(t2 in es.tenses(u"sería"))
  165. self.assertTrue(t1 in es.tenses(es.conjugate("ser", mood=es.INDICATIVE, tense=es.CONDITIONAL)))
  166. self.assertTrue(t2 in es.tenses(es.conjugate("ser", mood=es.CONDITIONAL)))
  167. print("pattern.es.tenses()")
  168. #---------------------------------------------------------------------------------------------------
  169. class TestParser(unittest.TestCase):
  170. def setUp(self):
  171. pass
  172. def test_find_lemmata(self):
  173. # Assert lemmata for nouns, adjectives, verbs and determiners.
  174. v = es.parser.find_lemmata([
  175. ["Los", "DT"], ["gatos", "NNS"], [u"negros", "JJ"], ["se", "PRP"], [u"sentó", "VB"],
  176. ["en", "IN"], ["la", "DT"], ["alfombra", "NN"]])
  177. self.assertEqual(v, [
  178. ["Los", "DT", "el"],
  179. ["gatos", "NNS", "gato"],
  180. ["negros", "JJ", "negro"],
  181. ["se", "PRP", "se"],
  182. [u"sentó", "VB", "sentar"],
  183. ["en", "IN", "en"],
  184. ["la", "DT", "el"],
  185. ["alfombra", "NN", "alfombra"]])
  186. print("pattern.es.parser.find_lemmata()")
  187. def test_parse(self):
  188. # Assert parsed output with Penn Treebank II tags (slash-formatted).
  189. # "el gato negro" is a noun phrase, "en la alfombra" is a prepositional noun phrase.
  190. v = es.parser.parse(u"El gato negro se sentó en la alfombra.")
  191. self.assertEqual(v, # XXX - shouldn't "se" be part of the verb phrase?
  192. u"El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O " + \
  193. u"se/PRP/B-NP/O sentó/VB/B-VP/O " + \
  194. u"en/IN/B-PP/B-PNP la/DT/B-NP/I-PNP alfombra/NN/I-NP/I-PNP ././O/O"
  195. )
  196. # Assert the accuracy of the Spanish tagger.
  197. i, n = 0, 0
  198. for sentence in open(os.path.join(PATH, "corpora", "tagged-es-wikicorpus.txt")).readlines():
  199. sentence = sentence.decode("utf-8").strip()
  200. s1 = [w.split("/") for w in sentence.split(" ")]
  201. s2 = [[w for w, pos in s1]]
  202. s2 = es.parse(s2, tokenize=False, tagset=es.PAROLE)
  203. s2 = [w.split("/") for w in s2.split(" ")]
  204. for j in range(len(s1)):
  205. if s1[j][1] == s2[j][1]:
  206. i += 1
  207. n += 1
  208. #print(float(i) / n)
  209. self.assertTrue(float(i) / n > 0.92)
  210. print("pattern.es.parser.parse()")
  211. def test_tag(self):
  212. # Assert [("el", "DT"), ("gato", "NN"), ("negro", "JJ")].
  213. v = es.tag("el gato negro")
  214. self.assertEqual(v, [("el", "DT"), ("gato", "NN"), ("negro", "JJ")])
  215. print("pattern.es.tag()")
  216. def test_command_line(self):
  217. # Assert parsed output from the command-line (example from the documentation).
  218. p = ["python", "-m", "pattern.es", "-s", "El gato negro.", "-OTCRL"]
  219. p = subprocess.Popen(p, stdout=subprocess.PIPE)
  220. p.wait()
  221. v = p.stdout.read()
  222. v = v.strip()
  223. self.assertEqual(v, "El/DT/B-NP/O/O/el gato/NN/I-NP/O/O/gato negro/JJ/I-NP/O/O/negro ././O/O/O/.")
  224. print("python -m pattern.es")
  225. #---------------------------------------------------------------------------------------------------
  226. def suite():
  227. suite = unittest.TestSuite()
  228. suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection))
  229. suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser))
  230. return suite
  231. if __name__ == "__main__":
  232. unittest.TextTestRunner(verbosity=1).run(suite())