PageRenderTime 37ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/bitrix/modules/search/tools/language.php

https://bitbucket.org/alex_poluektov/itech_test
PHP | 566 lines | 544 code | 14 blank | 8 comment | 21 complexity | c918b96838ed0a581f4ae10dd93b23b4 MD5 | raw file
Possible License(s): Apache-2.0
  1. <?
  2. class CSearchLanguage
  3. {
  4. var $_abc = array();
  5. var $_lang_id;
  6. var $_lang_bigramm_cache;
  7. var $_trigrams = array();
  8. var $_has_bigramm_info = null;
  9. var $_bigrams = null;
  10. function __construct($lang_id)
  11. {
  12. $this->_lang_id = $lang_id;
  13. }
  14. //Function loads language class
  15. static function GetLanguage($sLang)
  16. {
  17. static $arLanguages = array();
  18. if(!isset($arLanguages[$sLang]))
  19. {
  20. $obLanguage = null;
  21. $class_name = strtolower("CSearchLanguage".$sLang);
  22. if(!class_exists($class_name))
  23. {
  24. //First try to load customized class
  25. $strDirName = $_SERVER["DOCUMENT_ROOT"].BX_PERSONAL_ROOT."/php_interface/".$sLang."/search";
  26. $strFileName = $strDirName."/language.php";
  27. if(file_exists($strFileName))
  28. $obLanguage = @include($strFileName);
  29. if(!is_object($obLanguage))
  30. {
  31. if(!class_exists($class_name))
  32. {
  33. //Then module class
  34. $strDirName = $_SERVER["DOCUMENT_ROOT"]."/bitrix/modules/search/tools/".$sLang;
  35. $strFileName = $strDirName."/language.php";
  36. if(file_exists($strFileName))
  37. @include($strFileName);
  38. if(!class_exists($class_name))
  39. {
  40. $class_name = "CSearchLanguage";
  41. }
  42. }
  43. }
  44. }
  45. if(!is_object($obLanguage))
  46. $obLanguage = new $class_name($sLang);
  47. $obLanguage->LoadTrigrams($strDirName);
  48. $arStemInfo = stemming_init($sLang);
  49. if(is_array($arStemInfo))
  50. $obLanguage->_abc = array_flip($obLanguage->StrToArray($arStemInfo["abc"]));
  51. $obLanguage->_has_bigramm_info = is_callable(array($obLanguage, "getbigrammletterfreq"));
  52. $arLanguages[$sLang] = $obLanguage;
  53. }
  54. return $arLanguages[$sLang];
  55. }
  56. //Reads file with trigrams (combinations not allowed in the words)
  57. function LoadTrigrams($dir_name)
  58. {
  59. if(empty($this->_trigrams))
  60. {
  61. $file_name = $dir_name."/trigram";
  62. if(file_exists($file_name) && is_file($file_name))
  63. {
  64. $cache_id = filemtime($file_name).",".$file_name;
  65. $obCache = new CPHPCache;
  66. if($obCache->StartDataCache(360000, $cache_id, "search"))
  67. {
  68. $text = file_get_contents($file_name);
  69. $ar = explode("\n", $text);
  70. foreach($ar as $trigramm)
  71. {
  72. if(strlen($trigramm) == 3)
  73. {
  74. $strScanCodesTmp = $this->ConvertToScancode($trigramm, false, true);
  75. if(strlen($strScanCodesTmp) == 3)
  76. {
  77. $this->_trigrams[$strScanCodesTmp] = true;
  78. }
  79. }
  80. }
  81. $obCache->EndDataCache($this->_trigrams);
  82. }
  83. else
  84. {
  85. $this->_trigrams = $obCache->GetVars();
  86. }
  87. }
  88. }
  89. }
  90. function HasTrigrams()
  91. {
  92. return !empty($this->_trigrams);
  93. }
  94. //Check phrase against trigrams
  95. function CheckTrigrams($arScanCodes)
  96. {
  97. $result = 0;
  98. $check = "";
  99. $len = 0;
  100. foreach($arScanCodes as $i => $code)
  101. {
  102. if($code === false) //new word starts here
  103. {
  104. $check = "";
  105. $len = 0;
  106. }
  107. else
  108. {
  109. //running window of 3 bytes
  110. if($len < 3)
  111. {
  112. $check .= chr($code+1);
  113. $len++;
  114. }
  115. else
  116. {
  117. $check = $check[1].$check[2].chr($code+1);
  118. $len = 3;
  119. }
  120. }
  121. if($len >= 3)
  122. {
  123. if(isset($this->_trigrams[$check]))
  124. $result++;
  125. }
  126. }
  127. return $result;
  128. }
  129. //This function returns positions of the letters
  130. //on the keyboard. This one is default English layout
  131. function GetKeyboardLayout()
  132. {
  133. return array(
  134. "lo" => "` - ".
  135. "qwertyuiop[]".
  136. "asdfghjkl;'".
  137. "zxcvbnm,. ",
  138. "hi" => "~ ".
  139. "QWERTYUIOP{}".
  140. "ASDFGHJKL:\"".
  141. "ZXCVBNM<> "
  142. );
  143. }
  144. function ConvertFromScancode($arScancode)
  145. {
  146. $result = "";
  147. $keyboard = $this->GetKeyboardLayout();
  148. foreach($arScancode as $code)
  149. $result .= substr($keyboard["lo"], $code, 1);
  150. return $result;
  151. }
  152. function StrToArray($str)
  153. {
  154. if(defined("BX_UTF"))
  155. {
  156. $result = array();
  157. $len = strlen($str);
  158. for($i = 0;$i < $len; $i++)
  159. $result[] = substr($str, $i, 1);
  160. return $result;
  161. }
  162. else
  163. {
  164. return str_split($str);
  165. }
  166. }
  167. //This function converts text between layouts
  168. static function ConvertKeyboardLayout($text, $from, $to)
  169. {
  170. static $keyboards = array();
  171. $combo = $from."|".$to;
  172. if(!isset($keyboards[$combo]))
  173. {
  174. //Fill local cache
  175. if(!array_key_exists($from, $keyboards))
  176. {
  177. $ob = CSearchLanguage::GetLanguage($from);
  178. $keyboard = $ob->GetKeyboardLayout();
  179. if(is_array($keyboard))
  180. $keyboards[$from] = array_merge($ob->StrToArray($keyboard["lo"]), $ob->StrToArray($keyboard["hi"]));
  181. else
  182. $keyboards[$from] = null;
  183. }
  184. if(!array_key_exists($to, $keyboards))
  185. {
  186. $ob = CSearchLanguage::GetLanguage($to);
  187. $keyboard = $ob->GetKeyboardLayout();
  188. if(is_array($keyboard))
  189. $keyboards[$to] = array_merge($ob->StrToArray($keyboard["lo"]), $ob->StrToArray($keyboard["hi"]));
  190. else
  191. $keyboards[$to] = null;
  192. }
  193. //when both layouts defined
  194. if(isset($keyboards[$from]) && isset($keyboards[$to]))
  195. {
  196. $keyboards[$combo] = array();
  197. foreach($keyboards[$from] as $i => $ch)
  198. if($ch != false)
  199. $keyboards[$combo][$ch] = $keyboards[$to][$i];
  200. }
  201. }
  202. if(isset($keyboards[$combo]))
  203. return strtr($text, $keyboards[$combo]);
  204. else
  205. return $text;
  206. }
  207. //This function converts text into array of character positions
  208. //on the keyboard. Not defined chars turns into "false" value.
  209. function ConvertToScancode($text, $strict=false, $binary=false)
  210. {
  211. static $cache = array();
  212. if(!isset($cache[$this->_lang_id]))
  213. {
  214. $cache[$this->_lang_id] = array();
  215. $keyboard = $this->GetKeyboardLayout();
  216. foreach($this->StrToArray($keyboard["lo"]) as $pos => $ch)
  217. $cache[$this->_lang_id][$ch] = $pos;
  218. foreach($this->StrToArray($keyboard["hi"]) as $pos => $ch)
  219. $cache[$this->_lang_id][$ch] = $pos;
  220. }
  221. $scancodes = &$cache[$this->_lang_id];
  222. if($binary)
  223. {
  224. $result = "";
  225. foreach($this->StrToArray($text) as $ch)
  226. {
  227. if(
  228. isset($scancodes[$ch])
  229. && !($ch === " ")
  230. && !($strict && !isset($this->_abc[$ch]))
  231. )
  232. $result .= chr($scancodes[$ch]+1);
  233. }
  234. }
  235. else
  236. {
  237. $result = array();
  238. foreach($this->StrToArray($text) as $ch)
  239. {
  240. if($ch === " ")
  241. $result[] = false;
  242. elseif($strict && !isset($this->_abc[$ch]))
  243. $result[] = false;
  244. elseif(isset($scancodes[$ch]))
  245. $result[] = $scancodes[$ch];
  246. else
  247. $result[] = false;
  248. }
  249. }
  250. return $result;
  251. }
  252. function PreGuessLanguage($text, $lang=false)
  253. {
  254. //Indicates that there is no own guess
  255. return false;
  256. //In subclasses you should return array("from" => lang, "to" => lang) to translate
  257. //or return true when no translation nedded
  258. //or parent::GuessLanguage for futher processing
  259. }
  260. public static function GuessLanguage($text, $lang=false)
  261. {
  262. if(strlen($text) <= 0)
  263. return false;
  264. static $cache = array();
  265. if(empty($cache))
  266. {
  267. $cache[] = "en";//English is always in mind and on the first place
  268. $rsLanguages = CLanguage::GetList(($b=""), ($o=""));
  269. while($arLanguage = $rsLanguages->Fetch())
  270. if($arLanguage["LID"] != "en")
  271. $cache[] = $arLanguage["LID"];
  272. }
  273. if(is_array($lang))
  274. $arLanguages = $lang;
  275. else
  276. $arLanguages = $cache;
  277. if(count($arLanguages) < 2)
  278. return false;
  279. $languages_from = array();
  280. $max_len = 0;
  281. //Give customized languages a chance to guess
  282. foreach($arLanguages as $lang)
  283. {
  284. $ob = CSearchLanguage::GetLanguage($lang);
  285. $res = $ob->PreGuessLanguage($text, $lang);
  286. if(is_array($res))
  287. return $res;
  288. elseif($res === true)
  289. return false;
  290. }
  291. //First try to detect language which
  292. //was used to type the phrase
  293. foreach($arLanguages as $lang)
  294. {
  295. $ob = CSearchLanguage::GetLanguage($lang);
  296. $arScanCodesTmp1 = $ob->ConvertToScancode($text, true);
  297. $arScanCodesTmp2_cnt = count(array_filter($arScanCodesTmp1));
  298. //It will be one with most converted chars
  299. if($arScanCodesTmp2_cnt > $max_len)
  300. {
  301. $max_len = $arScanCodesTmp2_cnt;
  302. $languages_from = array($lang => $arScanCodesTmp1);
  303. }
  304. elseif($arScanCodesTmp2_cnt == $max_len)
  305. {
  306. $languages_from[$lang] = $arScanCodesTmp1;
  307. }
  308. }
  309. if($max_len < 2)
  310. return false;
  311. if(count($languages_from) <= 0)
  312. return false;
  313. //If more than one language is detected as input
  314. //try to get one with best trigram info
  315. $arDetectionFrom = array();
  316. $i = 0;
  317. foreach($languages_from as $lang => $arScanCodes)
  318. {
  319. $arDetectionFrom[$lang] = array();
  320. $ob = CSearchLanguage::GetLanguage($lang);
  321. $arDetectionFrom[$lang][] = $ob->HasTrigrams();
  322. $arDetectionFrom[$lang][] = $ob->CheckTrigrams($arScanCodes);
  323. //Calculate how far sequence of scan codes
  324. //is from language model
  325. //$deviation = $ob->GetDeviation($arScanCodes);
  326. //$arDetection[$lang_from_to][] = $deviation[1];
  327. //$arDetection[$lang_from_to][] = intval($deviation[0]*100);
  328. //Delay till compare
  329. $arDetectionFrom[$lang][] = $ob;
  330. $arDetectionFrom[$lang][] = $arScanCodes;
  331. $arDetectionFrom[$lang][] = $i;
  332. $i++;
  333. }
  334. uasort($arDetectionFrom, array("CSearchLanguage", "cmp"));
  335. //Now try the best to detect the language
  336. $arDetection = array();
  337. $i = 0;
  338. foreach($arDetectionFrom as $lang_from => $arTemp)
  339. {
  340. $arScanCodes = $languages_from[$lang_from];
  341. foreach($arLanguages as $lang)
  342. {
  343. $lang_from_to = $lang_from."=>".$lang;
  344. $arDetection[$lang_from_to] = array();
  345. $ob = CSearchLanguage::GetLanguage($lang);
  346. $arDetection[$lang_from_to][] = $ob->HasBigrammInfo();
  347. $arDetection[$lang_from_to][] = $ob->CheckTrigrams($arScanCodes);
  348. //Calculate how far sequence of scan codes
  349. //is from language model
  350. //$deviation = $ob->GetDeviation($arScanCodes);
  351. //$arDetection[$lang_from_to][] = $deviation[1];
  352. //$arDetection[$lang_from_to][] = intval($deviation[0]*100);
  353. //Delay till compare
  354. $arDetection[$lang_from_to][] = $ob;
  355. $arDetection[$lang_from_to][] = $arScanCodes;
  356. $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $lang_from, $lang);
  357. $arDetection[$lang_from_to][] = $alt_text !== $text;
  358. $arDetection[$lang_from_to][] = $i;
  359. $arDetection[$lang_from_to][] = $lang_from_to;
  360. $i++;
  361. }
  362. }
  363. uasort($arDetection, array("CSearchLanguage", "cmp"));
  364. $language_from_to = key($arDetection);
  365. list($language_from, $language_to) = explode("=>", $language_from_to);
  366. $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $language_from, $language_to);
  367. if($alt_text === $text)
  368. return false;
  369. return array("from" => $language_from, "to" => $language_to);
  370. }
  371. //Compare to results of text analysis
  372. static function cmp($a, $b)
  373. {
  374. if($a[0] && !$b[0]) //On first place we check if model supports bigrams check
  375. return -1;
  376. elseif($b[0] && !$a[0])
  377. return 1;
  378. else
  379. {
  380. $c = count($a);
  381. for($i = 1; $i < $c; $i++)
  382. {
  383. if($i == 2)
  384. {
  385. //Delayed deviation calculation
  386. if(is_object($a[2]))
  387. {
  388. $deviation = $a[2]->GetDeviation($a[3]);
  389. $a[2] = $deviation[1];
  390. if(count($a[3]) > 3)
  391. $a[3] = intval($deviation[0]*100);
  392. else
  393. $a[3] = 100;
  394. }
  395. if(is_object($b[2]))
  396. {
  397. $deviation = $b[2]->GetDeviation($b[3]);
  398. $b[2] = $deviation[1];
  399. if(count($b[3]) > 3)
  400. $b[3] = intval($deviation[0]*100);
  401. else
  402. $b[3] = 100;
  403. }
  404. }
  405. if($a[$i] < $b[$i])
  406. return -1;
  407. elseif($a[$i] > $b[$i])
  408. return 1;
  409. }
  410. return 0;//never happens
  411. }
  412. }
  413. //Function returns distance of the text (sequence of scan codes)
  414. //from language model
  415. function GetDeviation($arScanCodes)
  416. {
  417. //This is language model
  418. $lang_bigrams = $this->GetBigrammScancodeFreq();
  419. $lang_count = $lang_bigrams["count"];
  420. unset($lang_bigrams["count"]);
  421. //This is text model
  422. $text_bigrams = $this->ConvertToBigramms($arScanCodes);
  423. $count = $text_bigrams["count"];
  424. unset($text_bigrams["count"]);
  425. $deviation = 0;
  426. $zeroes = 0;
  427. foreach($text_bigrams as $key => $value)
  428. {
  429. if(!isset($lang_bigrams[$key]))
  430. {
  431. $zeroes++;
  432. $deviation += $value/$count;
  433. }
  434. else
  435. {
  436. //echo $this->ConvertFromScancode(explode(" ", $key)),"=",$lang_bigrams[$key]/$lang_count,"<br>";
  437. $deviation += abs($value/$count - $lang_bigrams[$key]/$lang_count);
  438. }
  439. }
  440. return array($deviation, $zeroes);
  441. }
  442. //Function returns bigramms of the text (array of scancodes)
  443. //For example "FAT RAT" will be
  444. //array("FA", "AT", "RA", "AT")
  445. //This is model of the text
  446. function ConvertToBigramms($arScancodes)
  447. {
  448. $result = array();
  449. $len = count($arScancodes)-1;
  450. for($i = 0; $i < $len; $i++)
  451. {
  452. $code1 = $arScancodes[$i];
  453. $code2 = $arScancodes[$i+1];
  454. if($code1 !== false && $code2 !== false)
  455. {
  456. $result["count"]++;
  457. $result[$code1." ".$code2]++;
  458. }
  459. }
  460. return $result;
  461. }
  462. function HasBigrammInfo()
  463. {
  464. return $this->_has_bigramm_info;
  465. }
  466. //Function returns model of the language
  467. function GetBigrammScancodeFreq()
  468. {
  469. if(!$this->HasBigrammInfo())
  470. return array("count"=>1);
  471. if(!isset($this->_lang_bigramm_cache))
  472. {
  473. $bigramms = $this->GetBigrammLetterFreq();
  474. $keyboard = $this->GetKeyboardLayout();
  475. $keyboard_lo = $keyboard["lo"];
  476. $keyboard_hi = $keyboard["hi"];
  477. $result = array();
  478. foreach($bigramms as $letter1 => $row)
  479. {
  480. $p1 = strpos($keyboard_lo, $letter1);
  481. if($p1 === false)
  482. $p1 = strpos($keyboard_hi, $letter1);
  483. $i = 0;
  484. foreach($bigramms as $letter2 => $tmp)
  485. {
  486. $p2 = strpos($keyboard_lo, $letter2);
  487. if($p2 === false)
  488. $p2 = strpos($keyboard_hi, $letter2);
  489. $weight = $row[$i];
  490. $result["count"] += $weight;
  491. $result[$p1." ".$p2] = $weight;
  492. $i++;
  493. }
  494. }
  495. $this->_lang_bigramm_cache = $result;
  496. }
  497. return $this->_lang_bigramm_cache;
  498. }
  499. }
  500. ?>