/tools/rebuildFullTextIndex.php

https://github.com/clyfe/DEXonline · PHP · 117 lines · 97 code · 17 blank · 3 comment · 20 complexity · 1a9d43982e37566738b4ce1e72016903 MD5 · raw file

  1. <?php
  2. require_once('../phplib/util.php');
  // Rebuilds the FullTextIndex table from scratch.
  //
  // Steps:
  //   1. Take the full-text-index lock so that only one rebuild runs at a time.
  //   2. Read every Definition row with status = 0 and split its internalRep
  //      into words (extractWords).
  //   3. For every non-stop word that has entries in InflectedForm, write one
  //      tab-separated line (lexemId, inflectionId, definitionId, position)
  //      per entry to a temporary file.
  //   4. Empty FullTextIndex and bulk-load the temporary file into it.
  //   5. Delete the temporary file and release the lock.
  //
  // The table is emptied only once the new index file is complete, so a
  // failure during the (potentially long) build leaves the previous index in
  // place. Every failure after the lock was acquired releases the lock first,
  // so that the next run is not blocked by a stale lock.

  // The build is slow and memory hungry; raise the limits for this script.
  ini_set('max_execution_time', '3600');
  ini_set('memory_limit', '256M');
  assert_options(ASSERT_BAIL, 1);

  log_scriptLog('Running rebuildFullTextIndex.php.');
  if (!Lock::acquire(LOCK_FULL_TEXT_INDEX)) {
    // The lock is held by someone else: report, but do NOT release it.
    OS::errorAndExit('Lock already exists!');
    exit;
  }

  // Word => "lexemId,inflectionId,lexemId,inflectionId,..." cache, filled
  // lazily by cacheWordForm(). Only words found in InflectedForm get a key.
  $ifMap = array();

  $dbResult = mysql_query('select id, internalRep from Definition where status = 0');
  if ($dbResult === false) {
    abortIndexRebuild('MySQL says: ' . mysql_error());
  }
  $numDefs = mysql_num_rows($dbResult);
  $defsSeen = 0;
  $indexSize = 0;

  $fileName = tempnam('/tmp', 'index_');
  $handle = ($fileName === false) ? false : fopen($fileName, 'w');
  if ($handle === false) {
    abortIndexRebuild("Cannot open temporary index file '$fileName' for writing.");
  }
  log_scriptLog("Writing index to file $fileName.");

  debug_init();
  debug_off();

  while ($dbRow = mysql_fetch_row($dbResult)) {
    // $dbRow[0] is the definition id, $dbRow[1] its internalRep.
    $words = extractWords($dbRow[1]);
    foreach ($words as $position => $word) {
      if (StringUtil::isStopWord($word, true)) {
        // Stop words are not indexed.
        continue;
      }
      if (!array_key_exists($word, $ifMap)) {
        cacheWordForm($word);
      }
      if (!array_key_exists($word, $ifMap)) {
        // Still unknown after the lookup: the word has no inflected form.
        continue;
      }

      // The cached value is a flat list of (lexemId, inflectionId) pairs.
      $lexemList = explode(',', $ifMap[$word]);
      $numFields = count($lexemList);
      for ($i = 0; $i + 1 < $numFields; $i += 2) {
        $line = $lexemList[$i] . "\t" . $lexemList[$i + 1] . "\t" . $dbRow[0] . "\t" . $position . "\n";
        if (fwrite($handle, $line) === false) {
          fclose($handle);
          abortIndexRebuild("Cannot write to temporary index file '$fileName'.");
        }
        $indexSize++;
      }
    }

    // Progress report every 10,000 definitions.
    if (++$defsSeen % 10000 == 0) {
      $runTime = debug_getRunningTimeInMillis() / 1000;
      $speed = ($runTime > 0) ? round($defsSeen / $runTime) : 0;
      log_scriptLog("$defsSeen of $numDefs definitions indexed ($speed defs/sec). " .
                    "Word map has " . count($ifMap) . " entries. " .
                    "Memory used: " . round(memory_get_usage() / 1048576, 1) . " MB.");
    }
  }

  mysql_free_result($dbResult);
  fclose($handle);
  log_scriptLog("$defsSeen of $numDefs definitions indexed.");
  log_scriptLog("Index size: $indexSize entries.");

  OS::executeAndAssert("chmod 666 $fileName");

  // Swap in the new index: empty the table, then bulk-load the file.
  log_scriptLog("Clearing table FullTextIndex.");
  if (!mysql_query('delete from FullTextIndex')) {
    abortIndexRebuild('MySQL says: ' . mysql_error());
  }
  log_scriptLog("Importing file $fileName into table FullTextIndex");
  if (!mysql_query("load data local infile '$fileName' into table FullTextIndex")) {
    abortIndexRebuild('MySQL says: ' . mysql_error());
  }

  util_deleteFile($fileName);
  if (!Lock::release(LOCK_FULL_TEXT_INDEX)) {
    log_scriptLog('WARNING: could not release lock!');
  }
  log_scriptLog('rebuildFullTextIndex.php completed successfully ' .
                '(against all odds)');

  /**
   * Aborts the rebuild: releases the full-text-index lock, then reports
   * $message through OS::errorAndExit().
   *
   * Must only be called while this script holds the lock.
   *
   * @param string $message the error text to report
   */
  function abortIndexRebuild($message) {
    Lock::release(LOCK_FULL_TEXT_INDEX);
    OS::errorAndExit($message);
    // Safety net; not reached if OS::errorAndExit() terminates the script,
    // as its name suggests.
    exit(1);
  }
  65. /***************************************************************************/
  /**
   * Splits a text into the list of words it contains.
   *
   * The text is lowercased with mb_strtolower(), passed through
   * AdminStringUtil::removeAccents() and broken into characters with
   * AdminStringUtil::unicodeExplode(). Every maximal run of characters that
   * occur in the letter list below becomes one word; any other character
   * acts as a separator and is dropped.
   *
   * @param string $text the text to split
   * @return array the words, in order of appearance, with sequential
   *               0-based keys
   */
  function extractWords($text) {
    $letters = 'abcdefghijklmnopqrstuvwxyzăâîșț';

    $normalized = AdminStringUtil::removeAccents(mb_strtolower($text));

    $words = array();
    $pending = '';
    foreach (AdminStringUtil::unicodeExplode($normalized) as $char) {
      if (strpos($letters, $char) !== false) {
        // Still inside a word: keep accumulating.
        $pending .= $char;
        continue;
      }

      // Separator: emit the word collected so far, if any, and start over.
      if ($pending) {
        $words[] = $pending;
      }
      $pending = '';
    }

    // The text may end in the middle of a word.
    if ($pending) {
      $words[] = $pending;
    }

    return $words;
  }
  /**
   * Looks up a word in InflectedForm.formNoAccent and caches the result.
   *
   * When at least one row matches, the global $ifMap gets an entry keyed by
   * $word whose value is the flat comma-separated string
   * "lexemId,inflectionId,lexemId,inflectionId,..." (one pair per row).
   * Words with no matching row are not added to $ifMap. If the query itself
   * fails, a warning is raised and $ifMap is left untouched.
   *
   * @param string $word the form to look up
   */
  function cacheWordForm($word) {
    global $ifMap;

    // Escape the word so the function is safe for arbitrary input, not only
    // for the letters-only words produced by extractWords().
    $escaped = mysql_real_escape_string($word);
    $dbResult = mysql_query("select lexemId, inflectionId from InflectedForm where formNoAccent = '$escaped'");
    if ($dbResult === false) {
      trigger_error('cacheWordForm: query failed for "' . $word . '": ' . mysql_error(), E_USER_WARNING);
      return;
    }

    $pairs = array();
    while ($dbRow = mysql_fetch_assoc($dbResult)) {
      $pairs[] = $dbRow['lexemId'] . ',' . $dbRow['inflectionId'];
    }
    mysql_free_result($dbResult);

    if ($pairs) {
      $ifMap[$word] = implode(',', $pairs);
    }
  }
  100. ?>