PageRenderTime 66ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/test/test_it.py

https://gitlab.com/admin-github-cloud/pattern
Python | 272 lines | 269 code | 2 blank | 1 comment | 1 complexity | c0f6eed4004f0cf1ceb213913bac25be MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
  3. import unittest
  4. import subprocess
  5. from pattern import it
  6. try:
  7. PATH = os.path.dirname(os.path.realpath(__file__))
  8. except:
  9. PATH = ""
  10. #---------------------------------------------------------------------------------------------------
  11. class TestInflection(unittest.TestCase):
  12. def setUp(self):
  13. pass
  14. def test_article(self):
  15. # Assert definite and indefinite article inflection.
  16. for a, n, g in (
  17. ("il" , "giorno" , it.M),
  18. ("l'" , "altro giorno", it.M),
  19. ("lo" , "zio" , it.M),
  20. ("l'" , "amica" , it.F),
  21. ("la" , "nouva amica" , it.F),
  22. ("i" , "giapponesi" , it.M + it.PL),
  23. ("gli", "italiani" , it.M + it.PL),
  24. ("gli", "zii" , it.M + it.PL),
  25. ("le" , "zie" , it.F + it.PL)):
  26. v = it.article(n, "definite", gender=g)
  27. self.assertEqual(a, v)
  28. for a, n, g in (
  29. ("uno", "zio" , it.M),
  30. ("una", "zia" , it.F),
  31. ("un" , "amico", it.M),
  32. ("un'", "amica", it.F)):
  33. v = it.article(n, "indefinite", gender=g)
  34. self.assertEqual(a, v)
  35. v = it.referenced("amica", gender="f")
  36. self.assertEqual(v, "un'amica")
  37. print("pattern.it.article()")
  38. print("pattern.it.referenced()")
  39. def test_gender(self):
  40. # Assert the accuracy of the gender disambiguation algorithm.
  41. from pattern.db import Datasheet
  42. i, n = 0, 0
  43. for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
  44. g = it.gender(sg)
  45. if mf in g and it.PLURAL not in g:
  46. i += 1
  47. g = it.gender(pl)
  48. if mf in g and it.PLURAL in g:
  49. i += 1
  50. n += 2
  51. self.assertTrue(float(i) / n > 0.92)
  52. print("pattern.it.gender()")
  53. def test_pluralize(self):
  54. # Assert the accuracy of the pluralization algorithm.
  55. from pattern.db import Datasheet
  56. i, n = 0, 0
  57. for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
  58. if it.pluralize(sg) == pl:
  59. i += 1
  60. n += 1
  61. self.assertTrue(float(i) / n > 0.93)
  62. print("pattern.it.pluralize()")
  63. def test_singularize(self):
  64. # Assert the accuracy of the singularization algorithm.
  65. from pattern.db import Datasheet
  66. i, n = 0, 0
  67. for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
  68. if it.singularize(pl) == sg:
  69. i += 1
  70. n += 1
  71. self.assertTrue(float(i) / n > 0.84)
  72. print("pattern.it.singularize()")
  73. def test_predicative(self):
  74. # Assert the accuracy of the predicative algorithm ("cruciali" => "cruciale").
  75. from pattern.db import Datasheet
  76. i, n = 0, 0
  77. for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
  78. if pos != "j":
  79. continue
  80. if it.predicative(pl) == sg:
  81. i += 1
  82. n += 1
  83. self.assertTrue(float(i) / n > 0.87)
  84. print("pattern.it.predicative()")
  85. def test_find_lemma(self):
  86. # Assert the accuracy of the verb lemmatization algorithm.
  87. i, n = 0, 0
  88. r = 0
  89. for v1, v2 in it.inflect.verbs.inflections.items():
  90. if it.inflect.verbs.find_lemma(v1) == v2:
  91. i += 1
  92. n += 1
  93. self.assertTrue(float(i) / n > 0.81)
  94. print("pattern.it.inflect.verbs.find_lemma()")
  95. def test_find_lexeme(self):
  96. # Assert the accuracy of the verb conjugation algorithm.
  97. i, n = 0, 0
  98. for v, lexeme1 in it.inflect.verbs.infinitives.items():
  99. lexeme2 = it.inflect.verbs.find_lexeme(v)
  100. for j in range(len(lexeme2)):
  101. if lexeme1[j] == lexeme2[j]:
  102. i += 1
  103. n += 1
  104. self.assertTrue(float(i) / n > 0.89)
  105. print("pattern.it.inflect.verbs.find_lexeme()")
  106. def test_conjugate(self):
  107. # Assert different tenses with different conjugations.
  108. for (v1, v2, tense) in (
  109. ("essere", u"essere", it.INFINITIVE),
  110. ("essere", u"sono", (it.PRESENT, 1, it.SINGULAR)),
  111. ("essere", u"sei", (it.PRESENT, 2, it.SINGULAR)),
  112. ("essere", u"è", (it.PRESENT, 3, it.SINGULAR)),
  113. ("essere", u"siamo", (it.PRESENT, 1, it.PLURAL)),
  114. ("essere", u"siete", (it.PRESENT, 2, it.PLURAL)),
  115. ("essere", u"sono", (it.PRESENT, 3, it.PLURAL)),
  116. ("essere", u"essendo", (it.PRESENT + it.PARTICIPLE)),
  117. ("essere", u"stato", (it.PAST + it.PARTICIPLE)),
  118. ("essere", u"ero", (it.IMPERFECT, 1, it.SINGULAR)),
  119. ("essere", u"eri", (it.IMPERFECT, 2, it.SINGULAR)),
  120. ("essere", u"era", (it.IMPERFECT, 3, it.SINGULAR)),
  121. ("essere", u"eravamo", (it.IMPERFECT, 1, it.PLURAL)),
  122. ("essere", u"eravate", (it.IMPERFECT, 2, it.PLURAL)),
  123. ("essere", u"erano", (it.IMPERFECT, 3, it.PLURAL)),
  124. ("essere", u"fui", (it.PRETERITE, 1, it.SINGULAR)),
  125. ("essere", u"fosti", (it.PRETERITE, 2, it.SINGULAR)),
  126. ("essere", u"fu", (it.PRETERITE, 3, it.SINGULAR)),
  127. ("essere", u"fummo", (it.PRETERITE, 1, it.PLURAL)),
  128. ("essere", u"foste", (it.PRETERITE, 2, it.PLURAL)),
  129. ("essere", u"furono", (it.PRETERITE, 3, it.PLURAL)),
  130. ("essere", u"sarei", (it.CONDITIONAL, 1, it.SINGULAR)),
  131. ("essere", u"saresti", (it.CONDITIONAL, 2, it.SINGULAR)),
  132. ("essere", u"sarebbe", (it.CONDITIONAL, 3, it.SINGULAR)),
  133. ("essere", u"saremmo", (it.CONDITIONAL, 1, it.PLURAL)),
  134. ("essere", u"sareste", (it.CONDITIONAL, 2, it.PLURAL)),
  135. ("essere", u"sarebbero", (it.CONDITIONAL, 3, it.PLURAL)),
  136. ("essere", u"sarò", (it.FUTURE, 1, it.SINGULAR)),
  137. ("essere", u"sarai", (it.FUTURE, 2, it.SINGULAR)),
  138. ("essere", u"sarà", (it.FUTURE, 3, it.SINGULAR)),
  139. ("essere", u"saremo", (it.FUTURE, 1, it.PLURAL)),
  140. ("essere", u"sarete", (it.FUTURE, 2, it.PLURAL)),
  141. ("essere", u"saranno", (it.FUTURE, 3, it.PLURAL)),
  142. ("essere", u"sii", (it.PRESENT, 2, it.SINGULAR, it.IMPERATIVE)),
  143. ("essere", u"sia", (it.PRESENT, 3, it.SINGULAR, it.IMPERATIVE)),
  144. ("essere", u"siamo", (it.PRESENT, 1, it.PLURAL, it.IMPERATIVE)),
  145. ("essere", u"siate", (it.PRESENT, 2, it.PLURAL, it.IMPERATIVE)),
  146. ("essere", u"siano", (it.PRESENT, 3, it.PLURAL, it.IMPERATIVE)),
  147. ("essere", u"sia", (it.PRESENT, 1, it.SINGULAR, it.SUBJUNCTIVE)),
  148. ("essere", u"sia", (it.PRESENT, 2, it.SINGULAR, it.SUBJUNCTIVE)),
  149. ("essere", u"sia", (it.PRESENT, 3, it.SINGULAR, it.SUBJUNCTIVE)),
  150. ("essere", u"siamo", (it.PRESENT, 1, it.PLURAL, it.SUBJUNCTIVE)),
  151. ("essere", u"siate", (it.PRESENT, 2, it.PLURAL, it.SUBJUNCTIVE)),
  152. ("essere", u"siano", (it.PRESENT, 3, it.PLURAL, it.SUBJUNCTIVE)),
  153. ("essere", u"fossi", (it.PAST, 1, it.SINGULAR, it.SUBJUNCTIVE)),
  154. ("essere", u"fossi", (it.PAST, 2, it.SINGULAR, it.SUBJUNCTIVE)),
  155. ("essere", u"fosse", (it.PAST, 3, it.SINGULAR, it.SUBJUNCTIVE)),
  156. ("essere", u"fossimo", (it.PAST, 1, it.PLURAL, it.SUBJUNCTIVE)),
  157. ("essere", u"foste", (it.PAST, 2, it.PLURAL, it.SUBJUNCTIVE)),
  158. ("essere", u"fossero", (it.PAST, 3, it.PLURAL, it.SUBJUNCTIVE))):
  159. self.assertEqual(it.conjugate(v1, tense), v2)
  160. print("pattern.it.conjugate()")
  161. def test_lexeme(self):
  162. # Assert all inflections of "essere".
  163. v = it.lexeme("essere")
  164. self.assertEqual(v, [
  165. u'essere', u'sono', u'sei', u'è', u'siamo', u'siete', u'essendo',
  166. u'fui', u'fosti', u'fu', u'fummo', u'foste', u'furono', u'stato',
  167. u'ero', u'eri', u'era', u'eravamo', u'eravate', u'erano',
  168. u'sarò', u'sarai', u'sarà', u'saremo', u'sarete', u'saranno',
  169. u'sarei', u'saresti', u'sarebbe', u'saremmo', u'sareste', u'sarebbero',
  170. u'sii', u'sia', u'siate', u'siano',
  171. u'fossi', u'fosse', u'fossimo', u'fossero'
  172. ])
  173. print("pattern.it.inflect.lexeme()")
  174. def test_tenses(self):
  175. # Assert tense recognition.
  176. self.assertTrue((it.PRESENT, 3, it.SG) in it.tenses(u"è"))
  177. self.assertTrue("2sg" in it.tenses("sei"))
  178. print("pattern.it.tenses()")
  179. #---------------------------------------------------------------------------------------------------
  180. class TestParser(unittest.TestCase):
  181. def setUp(self):
  182. pass
  183. def test_find_lemmata(self):
  184. # Assert lemmata for nouns, adjectives, verbs and determiners.
  185. v = it.parser.find_lemmata([
  186. ["I", "DT"], ["gatti", "NNS"], ["neri", "JJ"],
  187. ["seduti", "VB"], ["sul", "IN"], ["tatami", "NN"]])
  188. self.assertEqual(v, [
  189. ["I", "DT", "il"],
  190. ["gatti", "NNS", "gatto"],
  191. ["neri", "JJ", "nero"],
  192. ["seduti", "VB", "sedutare"],
  193. ["sul", "IN", "sul"],
  194. ["tatami", "NN", "tatami"]])
  195. print("pattern.it.parser.find_lemmata()")
  196. def test_parse(self):
  197. # Assert parsed output with Penn Treebank II tags (slash-formatted).
  198. # "il gatto nero" is a noun phrase, "sulla stuoia" is a prepositional noun phrase.
  199. v = it.parser.parse(u"Il gatto nero seduto sulla stuoia.")
  200. self.assertEqual(v,
  201. u"Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O " +
  202. u"seduto/VB/B-VP/O " + \
  203. u"sulla/IN/B-PP/B-PNP stuoia/NN/B-NP/I-PNP ././O/O"
  204. )
  205. # Assert the accuracy of the Italian tagger.
  206. i, n = 0, 0
  207. for sentence in open(os.path.join(PATH, "corpora", "tagged-it-wacky.txt")).readlines():
  208. sentence = sentence.decode("utf-8").strip()
  209. s1 = [w.split("/") for w in sentence.split(" ")]
  210. s2 = [[w for w, pos in s1]]
  211. s2 = it.parse(s2, tokenize=False)
  212. s2 = [w.split("/") for w in s2.split(" ")]
  213. for j in range(len(s1)):
  214. t1 = s1[j][1]
  215. t2 = s2[j][1]
  216. # WaCKy test set tags plural nouns as "NN", pattern.it as "NNS".
  217. # Some punctuation marks are also tagged differently,
  218. # but these are not necessarily errors.
  219. if t1 == t2 or (t1 == "NN" and t2.startswith("NN")) or s1[j][0] in "\":;)-":
  220. i += 1
  221. n += 1
  222. #print(float(i) / n)
  223. self.assertTrue(float(i) / n > 0.92)
  224. print("pattern.it.parser.parse()")
  225. def test_tag(self):
  226. # Assert [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")].
  227. v = it.tag("il gatto nero")
  228. self.assertEqual(v, [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")])
  229. print("pattern.it.tag()")
  230. def test_command_line(self):
  231. # Assert parsed output from the command-line (example from the documentation).
  232. p = ["python", "-m", "pattern.it", "-s", "Il gatto nero.", "-OTCRL"]
  233. p = subprocess.Popen(p, stdout=subprocess.PIPE)
  234. p.wait()
  235. v = p.stdout.read()
  236. v = v.strip()
  237. self.assertEqual(v, "Il/DT/B-NP/O/O/il gatto/NN/I-NP/O/O/gatto nero/JJ/I-NP/O/O/nero ././O/O/O/.")
  238. print("python -m pattern.it")
  239. #---------------------------------------------------------------------------------------------------
  240. def suite():
  241. suite = unittest.TestSuite()
  242. suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection))
  243. suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser))
  244. return suite
  245. if __name__ == "__main__":
  246. unittest.TextTestRunner(verbosity=1).run(suite())