/lib/test/core/Multilingual/Aligner/BilingualAlignerTest.php

https://gitlab.com/ElvisAns/tiki · PHP · 557 lines · 394 code · 107 blank · 56 comment · 23 complexity · 438f353be27f955c5c40f1216e799787 MD5 · raw file

  1. <?php
  2. // (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
  3. //
  4. // All Rights Reserved. See copyright.txt for details and a complete list of authors.
  5. // Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
  6. // $Id$
  7. /**
  8. * @group unit
  9. *
  10. */
  11. class Multilingual_Aligner_BilingualAlignerTest extends TikiTestCase
  12. {
  /**
   * Reminder that some tests in this class are switched off.
   *
   * The name does not start with "test", so PHPUnit's default discovery does
   * not run it; when invoked it fails unconditionally with the reminder text.
   */
  public function ___testReminder()
  {
      self::fail("remember to reactivate all tests in BilingualAlignerTest");
  }
  17. ////////////////////////////////////////////////////////////////
  18. // Documentation tests
  19. // These tests illustrate how to use this class.
  20. ////////////////////////////////////////////////////////////////
  /**
   * Usage example: a bilingual aligner is created with an argument-less
   * constructor call.
   *
   * This is documentation rather than a check: the method name has no "test"
   * prefix and the body makes no assertion.
   */
  public function thisIsHowYouCreateAbilingualAligner()
  {
      // Deliberately unused: constructing the object is the whole illustration.
      $bilingual_aligner = new Multilingual_Aligner_BilingualAligner();
  }
  25. ////////////////////////////////////////////////////////////////
  26. // Note: In the rest of these tests, you can assume that
  27. // $this->aligner is an instance of BilingualAligner
  28. // created as above.
  29. ////////////////////////////////////////////////////////////////
  /**
   * Test fixture: stores a newly constructed aligner in $this->aligner so
   * that the test methods and helpers of this class can use it.
   */
  protected function setUp(): void
  {
      $fresh_aligner = new Multilingual_Aligner_BilingualAligner();
      $this->aligner = $fresh_aligner;
  }
  /**
   * Usage example: align() is given one array per language and its result is
   * read as a list of pairs, index 0 being taken as the English side and
   * index 1 as the French side.
   *
   * This is documentation rather than a check: the method name has no "test"
   * prefix and the body makes no assertion.
   */
  public function thisIsHowYouAlignTwoTexts()
  {
      $bilingual_aligner = new Multilingual_Aligner_BilingualAligner();

      $english = ["Hello earthlings. Take me to your leader."];
      $french = ["Bonjour terriens. Inutile de résister. Amenez moi à votre chef."];
      $pairs = $bilingual_aligner->align($english, $french);

      // Deliberately unused: these reads only show how a pair is taken apart.
      $first_en_sent = $pairs[0][0];
      $first_fr_sent = $pairs[0][1];
  }
  44. ////////////////////////////////////////////////////////////////
  45. // Internal tests
  46. // These tests check the internal workings of the class.
  47. ////////////////////////////////////////////////////////////////
  /**
   * _segment_into_sentences() is expected to cut the sample text after "!"
   * and after the newline, keeping the leading space and the trailing "\n"
   * inside the segments they belong to.
   *
   * @group multilingual
   */
  public function testSegmentIntoSentences()
  {
      $expected = [
          "This is sentence 1!",
          " This is sentence 2\n",
          "* This is sentence 3"
      ];

      $actual = $this->aligner->_segment_into_sentences(
          "This is sentence 1! This is sentence 2\n* This is sentence 3"
      );

      $this->assertEquals($expected, $actual, "Sentences were not properly segmented");
  }
  /**
   * _segment_parallel_texts_to_sentences() is expected to leave the segmented
   * sentences of each text in the aligner's l1_sentences / l2_sentences
   * properties.
   *
   * @group multilingual
   */
  public function testSegmentParallelTextsToSentences()
  {
      $this->aligner->_segment_parallel_texts_to_sentences(
          "This is sentence 1! This is sentence 2.",
          "Voici la phrase 1! Voici la phrase 2."
      );

      $this->assertEquals(
          ["This is sentence 1!", " This is sentence 2."],
          $this->aligner->l1_sentences,
          "L1 sentences not generated properly."
      );
      $this->assertEquals(
          ["Voici la phrase 1!", " Voici la phrase 2."],
          $this->aligner->l2_sentences,
          "L2 sentences not generated properly."
      );
  }
  /**
   * Checks _sentence_length_delta() for non-empty, empty and null sentences
   * on either side.
   *
   * Each row is: L1 sentence, L2 sentence, expected delta, failure message.
   * Rows are checked in order and the first mismatch stops the test.
   *
   * @group multilingual
   */
  public function testSentenceLengthDelta()
  {
      $cases = [
          ["Hello world.", "Bonjour le monde.", 0.417, "Bad delta for case with two non-empty sentences."],
          ["", "Bonjour le monde.", 1, "Bad delta for case with only L1 sentence empty."],
          ["Hello world.", "", 1, "Bad delta for case with only L2 sentence empty."],
          ["", "", 0, "Bad delta for case with both sentences empty."],
          [null, "Bonjour le monde.", 1, "Bad delta for case with only L1 sentence null."],
          ["Hello world.", null, 1, "Bad delta for case with only L2 sentence null."],
          [null, null, 0, "Bad delta for case with both sentences null."],
      ];

      foreach ($cases as [$l1, $l2, $expected_delta, $failure_message]) {
          $this->assertSentenceLengthDeltaIs($l1, $l2, $expected_delta, $failure_message);
      }
  }
  /**
   * Disabled check of the cost matrix built by _generate_shortest_path_matrix().
   *
   * The name does not start with "test", so PHPUnit's default discovery does
   * not run it, and its first statement fails unconditionally: the expected
   * matrix below is known to be incomplete and nothing after the fail() call
   * executes until that is fixed.
   *
   * The expectation maps each origin node ID to its destinations; the values
   * are the placeholder strings "match_cost" and "goto_end_cost".
   */
  public function __testGenerateShortestPathMatrix()
  {
      $this->fail("Expected distance matrix is missing some destinations after some changes we made. Fix it.");

      $this->setupSegmentedSentences();
      $this->aligner->_generate_shortest_path_matrix();

      $match = "match_cost";
      $to_end = "goto_end_cost";

      $expected_matrix = [
          '-1n0|-1n0' => [
              '-1m1|-1m1' => $match,
              '-1m2|-1m1' => $match,
              '-1m1|-1m2' => $match,
              '-1m1|-1m0' => $match,
              '-1m0|-1m1' => $match,
          ],
          '-1m1|-1m1' => [
              '0m1|0m1' => $match,
              '0m1|0m2' => $match,
              '0m1|0m0' => $match,
              '0m0|0m1' => $match,
              'END' => $to_end,
          ],
          '-1m2|-1m1' => [
              'END' => $to_end,
              '1m0|0m1' => $match,
              '1m0|0m2' => $match,
              '1m0|0m0' => $match,
          ],
          '-1m1|-1m2' => [
              '0m1|1m1' => $match,
              'END' => $to_end,
              '0m1|1m0' => $match,
              '0m0|1m1' => $match,
          ],
          '-1m1|-1m0' => [
              '0m1|-1m1' => $match,
              'END' => $to_end,
              '0m1|-1m2' => $match,
              '0m1|-1m0' => $match,
              '0m0|-1m1' => $match,
          ],
          '-1m0|-1m1' => [
              '-1m1|0m1' => $match,
              '-1m1|0m2' => $match,
              '-1m2|0m1' => $match,
              '-1m1|0m0' => $match,
              '-1m0|0m1' => $match,
          ],
          '0m1|0m1' => [
              '1m0|1m1' => $match,
              '1m0|1m0' => $match,
              'END' => $to_end,
          ],
          '0m1|0m2' => [
              '1m0|2m0' => $match,
              'END' => $to_end,
          ],
          '0m1|0m0' => [
              '1m0|0m1' => $match,
              '1m0|0m2' => $match,
              '1m0|0m0' => $match,
              'END' => $to_end,
          ],
          '0m0|0m1' => [
              '0m1|1m1' => $match,
              '0m1|1m0' => $match,
              '0m0|1m1' => $match,
              'END' => $to_end,
          ],
          '1m0|0m1' => [
              '1m0|1m1' => $match,
              'END' => $to_end,
          ],
          '0m1|1m1' => [
              'END' => $to_end,
          ],
          '0m1|1m0' => [
              '1m0|1m1' => $match,
              'END' => $to_end,
          ],
          '0m0|1m1' => [
              '0m1|2m0' => $match,
              'END' => $to_end,
          ],
          '0m1|-1m1' => [
              '1m0|0m1' => $match,
              'END' => $to_end,
          ],
          '0m1|-1m2' => [
              '1m0|1m1' => $match,
              'END' => $to_end,
          ],
          '0m1|-1m0' => [
              '1m0|-1m1' => $match,
              'END' => $to_end,
          ],
          '0m0|-1m1' => [
              '0m1|0m1' => $match,
              '0m1|0m2' => $match,
              '0m1|0m0' => $match,
              '0m0|0m1' => $match,
              'END' => $to_end,
          ],
          '-1m1|0m1' => [
              '0m1|1m1' => $match,
              '0m1|1m0' => $match,
              '0m0|1m1' => $match,
              'END' => $to_end,
          ],
          '-1m1|0m2' => [
              '0m1|2m0' => $match,
              'END' => $to_end,
          ],
          '-1m2|0m1' => [
              '1m0|1m1' => $match,
              'END' => $to_end,
          ],
          '-1m1|0m0' => [
              '0m1|0m1' => $match,
              '0m1|0m2' => $match,
              '0m1|0m0' => $match,
              '0m0|0m1' => $match,
              'END' => $to_end,
          ],
          '-1m0|0m1' => [
              '-1m1|1m1' => $match,
              '-1m2|1m1' => $match,
              '-1m1|1m0' => $match,
              '-1m0|1m1' => $match,
              'END' => $to_end,
          ],
          '1m0|1m1' => [
              'END' => $to_end,
          ],
          '0m1|2m0' => [
              'END' => $to_end,
          ],
          '1m0|-1m1' => [
              '1m0|0m1' => $match,
              'END' => $to_end,
          ],
          '-1m1|1m1' => [
              '0m1|2m0' => $match,
              'END' => $to_end,
          ],
          '-1m2|1m1' => [
              'END' => $to_end,
          ],
          '-1m1|1m0' => [
              '0m1|1m1' => $match,
              '0m1|1m0' => $match,
              '0m0|1m1' => $match,
              'END' => $to_end,
          ],
          '-1m0|1m1' => [
              '-1m1|2m0' => $match,
              'END' => $to_end,
          ],
          '-1m1|2m0' => [
              '0m1|2m0' => $match,
              'END' => $to_end,
          ],
          '-1m0|2m0' => [
              'END' => $to_end,
          ],
          '-1m2|2m0' => [
              'END' => $to_end,
          ],
          '0m0|2m0' => [
              'END' => $to_end,
          ],
          '1m0|-1m0' => [
              'END' => $to_end,
          ],
          '1m0|-1m2' => [
              'END' => $to_end,
          ],
          '1m0|0m0' => [
              'END' => $to_end,
          ],
          '1m0|0m2' => [
              'END' => $to_end,
          ],
          '1m0|1m0' => [
              'END' => $to_end,
          ],
          '1m0|2m0' => [
              'END' => $to_end,
          ],
      ];

      $this->assertCostMatrixEquals(
          $expected_matrix,
          $this->aligner->cost_matrix,
          "Cost matrix was wrong."
      );
  }
  /**
   * Checks the six-element result of _parse_node_ID() for matched, skipped,
   * before-first-sentence and START node IDs.
   *
   * Each row is: node ID, expected parsed info, failure message.
   *
   * @group multilingual
   */
  public function testParseNodeID()
  {
      $cases = [
          [
              '3m1|5m1',
              [3, 'm', 1, 5, 'm', 1],
              "Parsed node ID info was wrong for case where sentences are matched.",
          ],
          [
              '3m1|5m0',
              [3, 'm', 1, 5, 'm', 0],
              "Parsed node ID info was wrong for case where sentences were skipped.",
          ],
          [
              '-1m1|-1m1',
              [-1, 'm', 1, -1, 'm', 1],
              "Parsed node ID info was wrong for case with sentence number = -1 (i.e., cursor before first sentences on both sides).",
          ],
          [
              '-1n0|-1n0',
              [-1, 'n', 0, -1, 'n', 0],
              "Parsed node ID info was wrong for START node '-1n0|-1n0'.",
          ],
      ];

      foreach ($cases as [$node_id, $expected_info, $failure_message]) {
          $this->assertParseNodeIDYields($node_id, $expected_info, $failure_message);
      }
  }
  /**
   * Checks the node ID string built by _generate_node_ID() once the sample
   * texts have been segmented.
   *
   * Each row is: expected node ID, the six arguments passed to
   * _generate_node_ID(), failure message (empty when the original check
   * carried none).
   *
   * @group multilingual
   */
  public function testGenerateNodeID()
  {
      $this->setupSegmentedSentences();

      $cases = [
          ['0m1|0m0', [0, 'm', 1, 0, 'm', 0], ''],
          [
              '1m0|1m1',
              [1, 'm', 2, 1, 'm', 1],
              "Node ID should never go passed the last L1 or L2 sentence number",
          ],
          [
              '-1n0|-1n0',
              [-1, 'n', 0, -1, 'n', 0],
              "Node ID was wrong for START node '-1n0|-1n0'.",
          ],
      ];

      foreach ($cases as [$expected_id, $arguments, $failure_message]) {
          $generated_id = $this->aligner->_generate_node_ID(...$arguments);
          $this->assertEquals($expected_id, $generated_id, $failure_message);
      }
  }
  /**
   * Checks the [L1, L2] sentence numbers reported by _sentences_at_this_node()
   * for a matched node, an initial node, a node with a skip and the START node.
   *
   * Each row is: node ID, expected sentence numbers, failure message.
   *
   * @group multilingual
   */
  public function testSentencesAtThisNode()
  {
      $cases = [
          [
              '3m1|5m1',
              [4, 6],
              "Current sentences were wrong for node with matches on both sides.",
          ],
          [
              '-1m1|-1m1',
              [0, 0],
              "Current sentences were wrong for initial nodes (i.e., sentence number = -1)",
          ],
          [
              '4m1|5m0',
              [5, 5],
              "Current sentences were wrong for case where we skip a sentence.",
          ],
          [
              '-1n0|-1n0',
              [-1, -1],
              "Current sentences were wrong for START node '-1n0|-1n0'.",
          ],
      ];

      foreach ($cases as [$node_id, $expected_sentences, $failure_message]) {
          $this->assertSentencesAtThisNode($node_id, $expected_sentences, $failure_message);
      }
  }
  /**
   * For node '3m1|5m1', _sentences_preceding_this_node() is expected to
   * return [3, 5].
   *
   * @group multilingual
   */
  public function testSentencesPrecedingThisNode()
  {
      $node_id = '3m1|5m1';

      $this->assertEquals(
          [3, 5],
          $this->aligner->_sentences_preceding_this_node($node_id),
          "Sentences preceding node '" . $node_id . "' were wrong."
      );
  }
  /**
   * Checks _compute_node_transition_cost() on the segmented sample texts for
   * 1-to-1, 1-to-2 and 2-to-1 matches and for a skip on either side.
   *
   * Each row is: destination node ID, expected cost, failure message.
   *
   * @group multilingual
   */
  public function testComputeNodeTransitionCost()
  {
      $this->setupSegmentedSentences();

      $cases = [
          ["0m1|0m1", 0, "Transition cost failed for 1 to 1 match"],
          ["0m1|0m2", 1.29, "Transition cost failed for 1 to 2 match"],
          ["0m2|0m1", 0.58, "Transition cost failed for 2 to 1 match"],
          ["0m1|0m0", 1, "Transition cost failed for L1 side skip"],
          ["0m0|0m1", 1, "Transition cost failed for L2 side skip"],
      ];

      foreach ($cases as [$destination, $expected_cost, $failure_message]) {
          $this->assertComputeNodeTransitionCostYields($destination, $expected_cost, $failure_message);
      }
  }
  381. ////////////////////////////////////////////////////////////////
  382. // Helper methods
  383. ////////////////////////////////////////////////////////////////
  /**
   * Asserts that _sentence_length_delta() of the two sentences equals
   * $exp_delta to within 0.001.
   *
   * @param mixed  $l1_sentence First sentence handed to the aligner.
   * @param mixed  $l2_sentence Second sentence handed to the aligner.
   * @param mixed  $exp_delta   Expected delta.
   * @param string $message     Case description, put in front of the generic failure text.
   */
  private function assertSentenceLengthDeltaIs($l1_sentence, $l2_sentence, $exp_delta, $message)
  {
      $this->assertEqualsWithDelta(
          $exp_delta,
          $this->aligner->_sentence_length_delta($l1_sentence, $l2_sentence),
          0.001,
          $message . "\nSentence length delta was wrong."
      );
  }
  /**
   * Feeds the sample English and French texts to
   * _segment_parallel_texts_to_sentences() on the shared aligner, for tests
   * that need segmented sentences in place before they start.
   */
  private function setupSegmentedSentences()
  {
      $english_text = "Hello earthlings. Take me to your leader.";
      $french_text = "Bonjour terriens. Inutile de résister. Amenez moi à votre chef.";

      $this->aligner->_segment_parallel_texts_to_sentences($english_text, $french_text);
  }
  /**
   * Asserts that _parse_node_ID() turns $node_id into $exp_parsed_info.
   *
   * @param string $node_id         Node ID to parse.
   * @param array  $exp_parsed_info Expected parse result.
   * @param string $message         Case description, put in front of the generic failure text.
   */
  private function assertParseNodeIDYields($node_id, $exp_parsed_info, $message)
  {
      $this->assertEquals(
          $exp_parsed_info,
          $this->aligner->_parse_node_ID($node_id),
          $message . "\nParsed info was wrong for node ID: '" . $node_id . "'"
      );
  }
  /**
   * Asserts that _sentences_at_this_node() returns $exp_next_sentences for
   * $node_id.
   *
   * @param string $node_id            Node ID to inspect.
   * @param array  $exp_next_sentences Expected return value.
   * @param string $message            Case description, put in front of the generic failure text.
   */
  private function assertSentencesAtThisNode($node_id, $exp_next_sentences, $message)
  {
      $failure_text = "$message\nNext sentences were wrong for node '$node_id'";
      $actual_sentences = $this->aligner->_sentences_at_this_node($node_id);

      $this->assertEquals($exp_next_sentences, $actual_sentences, $failure_text);
  }
  /**
   * Asserts that two cost matrices have the same set of origins and, for each
   * expected origin, equal destination costs.
   *
   * The caller's $message is put in front of every failure text, as the other
   * assertion helpers of this class do, so a failure says which case broke.
   *
   * @param array  $exp_cost_matrix Expected matrix: origin => destinations.
   * @param mixed  $got_cost_matrix Matrix under test; must be an array.
   * @param string $message         Case description prepended to failure texts.
   */
  private function assertCostMatrixEquals($exp_cost_matrix, $got_cost_matrix, $message)
  {
      // Fail as an assertion (not as an error inside array_keys()/sort())
      // when no matrix was produced at all.
      $this->assertIsArray(
          $got_cost_matrix,
          $message . "\nCost matrix is not an array."
      );

      // Origins are compared as sorted lists so that insertion order of the
      // two matrices does not matter.
      $exp_origins = array_keys($exp_cost_matrix);
      sort($exp_origins);
      $got_origins = array_keys($got_cost_matrix);
      sort($got_origins);
      $this->assertEquals(
          $exp_origins,
          $got_origins,
          $message . "\nList of origins in cost matrix differed."
      );

      // The origin sets are equal at this point, so every expected origin
      // is also present in the matrix under test.
      foreach ($exp_cost_matrix as $origin => $exp_costs) {
          $this->assertEquals(
              $exp_costs,
              $got_cost_matrix[$origin],
              $message . "\nCosts from origin $origin differed"
          );
      }
  }
  /**
   * Asserts that _compute_node_transition_cost() for $destination_node equals
   * $exp_cost to within 0.01.
   *
   * @param string $destination_node Node ID handed to the aligner.
   * @param mixed  $exp_cost         Expected cost.
   * @param string $message          Case description, put in front of the generic failure text.
   */
  public function assertComputeNodeTransitionCostYields(
      $destination_node,
      $exp_cost,
      $message
  ) {
      $this->assertEqualsWithDelta(
          $exp_cost,
          $this->aligner->_compute_node_transition_cost($destination_node),
          0.01,
          "$message\nTransition cost to node '$destination_node' was wrong"
      );
  }
  450. }