PageRenderTime 48ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/phpBB/develop/generate_utf_tables.php

https://github.com/VSEphpbb/phpbb
PHP | 575 lines | 379 code | 64 blank | 132 comment | 43 complexity | 104f844e031edf66a7fed3dd7748c878 MD5 | raw file
  1. <?php
  2. /**
  3. *
  4. * This file is part of the phpBB Forum Software package.
  5. *
  6. * @copyright (c) phpBB Limited <https://www.phpbb.com>
  7. * @license GNU General Public License, version 2 (GPL-2.0)
  8. *
  9. * For full copyright and license information, please see
  10. * the docs/CREDITS.txt file.
  11. *
  12. */
  // This generator is a command line tool; refuse to run under any other SAPI.
  if ('cli' != php_sapi_name())
  {
      die("This program must be run from the command line.\n");
  }

  //
  // Security message:
  //
  // This script is potentially dangerous, which is why it ships disabled.
  // Remove or comment out the die() call right below to enable it, and do
  // NOT FORGET to remove or disable the script again once you are done.
  //
  die("Please read the first lines of this script for instructions on how to enable it");

  set_time_limit(0);

  define('IN_PHPBB', true);
  $phpbb_root_path = '../';
  $phpEx = substr(strrchr(__FILE__, '.'), 1);

  echo "Checking for required files\n";

  // download() skips every file that already exists in the develop/ dir
  $ucd_files = array(
      'CompositionExclusions.txt',
      'DerivedNormalizationProps.txt',
      'UnicodeData.txt',
  );
  foreach ($ucd_files as $ucd_file)
  {
      download('http://www.unicode.org/Public/UNIDATA/' . $ucd_file);
  }

  echo "\n";

  require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);

  // Collected table data, indexed by target file name, then by variable name
  $file_contents = array();
  /**
  * Build the Hangul/Jamo lookup tables
  *
  * For every jamo of the L, V and T ranges, two entries keyed by the UTF-8
  * form of the jamo are stored: its index value and its type constant.
  */
  echo "\nGenerating Hangul and Jamo tables\n";

  $jamo_ranges = array(
      // base codepoint, number of jamo, index multiplier, index offset, jamo type
      array(UNICODE_HANGUL_LBASE, UNICODE_HANGUL_LCOUNT, UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT, UNICODE_HANGUL_SBASE, UNICODE_JAMO_L),
      array(UNICODE_HANGUL_VBASE, UNICODE_HANGUL_VCOUNT, UNICODE_HANGUL_TCOUNT, 0, UNICODE_JAMO_V),
      array(UNICODE_HANGUL_TBASE, UNICODE_HANGUL_TCOUNT, 1, 0, UNICODE_JAMO_T),
  );

  foreach ($jamo_ranges as $jamo_range)
  {
      list($jamo_base, $jamo_count, $jamo_step, $jamo_offset, $jamo_type) = $jamo_range;

      for ($jamo_pos = 0; $jamo_pos < $jamo_count; ++$jamo_pos)
      {
          $jamo = cp_to_utf($jamo_base + $jamo_pos);

          $file_contents['utf_normalizer_common']['utf_jamo_index'][$jamo] = $jamo_pos * $jamo_step + $jamo_offset;
          $file_contents['utf_normalizer_common']['utf_jamo_type'][$jamo] = $jamo_type;
      }
  }
  /**
  * Load the CompositionExclusions table
  *
  * Fills $exclude with one entry (codepoint => 1) for every codepoint listed
  * in CompositionExclusions.txt. Only lines starting with a hexadecimal digit
  * are data lines; everything else is skipped.
  */
  echo "Loading CompositionExclusion\n";

  // Read the file from the directory download() stores it in
  $exclusions_path = $phpbb_root_path . 'develop/CompositionExclusions.txt';
  if (!$fp = fopen($exclusions_path, 'rt'))
  {
      die("Can't open $exclusions_path for reading\n");
  }

  $exclude = array();
  while (($line = fgets($fp, 1024)) !== false)
  {
      if ($line === '' || strpos('0123456789ABCDEFabcdef', $line[0]) === false)
      {
          continue;
      }

      // The first token is either a single codepoint or a "start..end" range
      $cp = strtok($line, " \t\r\n#");

      if ($pos = strpos($cp, '..'))
      {
          $start = hexdec(substr($cp, 0, $pos));
          $end = hexdec(substr($cp, $pos + 2));

          // Ranges are inclusive: the last codepoint is excluded as well
          for ($i = $start; $i <= $end; ++$i)
          {
              $exclude[$i] = 1;
          }
      }
      else
      {
          $exclude[hexdec($cp)] = 1;
      }
  }
  fclose($fp);
  /**
  * Load QuickCheck tables
  *
  * Reads the NFC_QC and NFKC_QC properties from DerivedNormalizationProps.txt
  * and stores, for each listed codepoint, UNICODE_QC_MAYBE (value "M") or
  * UNICODE_QC_NO (any other value), keyed by the UTF-8 form of the codepoint.
  */
  echo "Generating QuickCheck tables\n";

  // Read the file from the directory download() stores it in
  $props_path = $phpbb_root_path . 'develop/DerivedNormalizationProps.txt';
  if (!$fp = fopen($props_path, 'rt'))
  {
      die("Can't open $props_path for reading\n");
  }

  while (($line = fgets($fp, 1024)) !== false)
  {
      // Data lines start with a hexadecimal digit
      if ($line === '' || strpos('0123456789ABCDEFabcdef', $line[0]) === false)
      {
          continue;
      }

      // Drop the trailing comment, then split the ';'-separated fields
      $p = array_map('trim', explode(';', strtok($line, '#')));

      /**
      * Capture only NFC_QC, NFKC_QC
      */
      if (!isset($p[1]) || !preg_match('#^NFK?C_QC$#', $p[1]))
      {
          continue;
      }

      if ($pos = strpos($p[0], '..'))
      {
          $start = hexdec(substr($p[0], 0, $pos));
          $end = hexdec(substr($p[0], $pos + 2));
      }
      else
      {
          $start = $end = hexdec($p[0]);
      }

      if ($start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST)
      {
          /**
          * We do not store Hangul syllables in the array
          */
          continue;
      }

      $val = (isset($p[2]) && $p[2] == 'M') ? UNICODE_QC_MAYBE : UNICODE_QC_NO;
      $file = ($p[1] == 'NFKC_QC') ? 'utf_nfkc_qc' : 'utf_nfc_qc';

      for ($i = $start; $i <= $end; ++$i)
      {
          /**
          * The vars have the same name as the file: $utf_nfc_qc is in utf_nfc_qc.php
          */
          $file_contents[$file][$file][cp_to_utf($i)] = $val;
      }
  }
  fclose($fp);
  /**
  * Do mappings
  *
  * Reads UnicodeData.txt and collects:
  * - the combining class (field #3) of every codepoint where it is non-empty
  *   and non-zero
  * - the decomposition mapping (field #5) as decimal codepoints separated with
  *   a space; tagged mappings (containing a '>') go to $map['NFKD'] only,
  *   untagged ones to both $map['NFD'] and $map['NFKD']
  */
  echo "Loading Unicode decomposition mappings\n";

  $unicode_data_path = $phpbb_root_path . 'develop/UnicodeData.txt';
  if (!$fp = fopen($unicode_data_path, 'rt'))
  {
      die("Can't open $unicode_data_path for reading\n");
  }

  $map = array();
  while (($line = fgets($fp, 1024)) !== false)
  {
      // Blank lines carry no record
      if (trim($line) === '')
      {
          continue;
      }

      $p = explode(';', $line);
      $cp = hexdec($p[0]);

      if (!empty($p[3]))
      {
          /**
          * Store combining class > 0
          */
          $file_contents['utf_normalizer_common']['utf_combining_class'][cp_to_utf($cp)] = (int) $p[3];
      }

      // strip_tags() removes a leading <tag>, leaving only the hexadecimal codepoints
      if (!isset($p[5]) || !preg_match_all('#[0-9A-F]+#', strip_tags($p[5]), $m))
      {
          continue;
      }

      $decomp_seq = implode(' ', array_map('hexdec', $m[0]));

      if (strpos($p[5], '>'))
      {
          $map['NFKD'][$cp] = $decomp_seq;
      }
      else
      {
          $map['NFD'][$cp] = $map['NFKD'][$cp] = $decomp_seq;
      }
  }
  fclose($fp);
  /**
  * Build the canonical composition table
  *
  * Maps the UTF-8 form of a canonical decomposition sequence to the UTF-8 form
  * of the codepoint it belongs to. When several codepoints share a sequence,
  * the first one encountered wins.
  */
  echo "Generating the Canonical Composition table\n";

  foreach ($map['NFD'] as $composed_cp => $nfd_seq)
  {
      /**
      * Singletons are excluded from canonical composition
      */
      $is_singleton = !strpos($nfd_seq, ' ');

      if ($is_singleton || isset($exclude[$composed_cp]))
      {
          continue;
      }

      // Concatenate the UTF-8 form of every codepoint of the sequence
      $utf_key = '';
      foreach (explode(' ', $nfd_seq) as $seq_cp)
      {
          $utf_key .= cp_to_utf($seq_cp);
      }

      if (isset($file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_key]))
      {
          continue;
      }

      $file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_key] = cp_to_utf($composed_cp);
  }
  /**
  * Expand the NF[K]D mappings recursively and queue them for output
  *
  * The 'NFKD' map ends up in utf_compatibility_decomp, any other map in
  * utf_canonical_decomp. Keys and values are stored in their UTF-8 form.
  */
  echo "Generating the Canonical and Compatibility Decomposition tables\n\n";

  foreach ($map as $type => $decomp_map)
  {
      // First pass: replace every sequence by its fully decomposed form
      foreach ($decomp_map as $cp => $partial_seq)
      {
          $decomp_map[$cp] = decompose($decomp_map, $partial_seq);
      }
      unset($partial_seq);

      // File and variable share the same name
      $table = ($type == 'NFKD') ? 'utf_compatibility_decomp' : 'utf_canonical_decomp';

      /**
      * Generate the corresponding file
      */
      foreach ($decomp_map as $cp => $full_seq)
      {
          $utf_seq = '';
          foreach (explode(' ', $full_seq) as $seq_cp)
          {
              $utf_seq .= cp_to_utf($seq_cp);
          }

          $file_contents[$table][$table][cp_to_utf($cp)] = $utf_seq;
      }
  }
  /**
  * Generate and/or alter the files
  *
  * Each entry of $file_contents becomes one file in includes/utf/data/. The
  * file assigns every collected table to $GLOBALS under its variable name.
  */
  foreach ($file_contents as $file => $contents)
  {
      /**
      * Generate a new file
      */
      $path = $phpbb_root_path . 'includes/utf/data/' . $file . '.' . $phpEx;

      echo "Writing to $file.$phpEx\n";

      // Stop here: writing to a handle that failed to open cannot succeed
      if (!$fp = fopen($path, 'wb'))
      {
          die('Cannot open ' . $path . " for write\n");
      }

      fwrite($fp, '<?php');

      foreach ($contents as $var => $val)
      {
          fwrite($fp, "\n\$GLOBALS[" . my_var_export($var) . ']=' . my_var_export($val) . ";");
      }

      fclose($fp);
  }

  echo "\n*** UTF-8 normalization tables done\n\n";
  /**
  * Now we'll generate the files needed by the search indexer
  *
  * $map is rebuilt from UnicodeData.txt: it maps the codepoint of every
  * letter, mark and number to the string the search indexer should use for it.
  */
  echo "Generating search indexer tables\n";

  $unicode_data_path = $phpbb_root_path . 'develop/UnicodeData.txt';
  if (!$fp = fopen($unicode_data_path, 'rt'))
  {
      die("Can't open $unicode_data_path for reading\n");
  }

  $map = array();
  while ($line = fgets($fp, 1024))
  {
      /**
      * The current line is split, $m[0] hold the codepoint in hexadecimal and
      * all other fields numbered as in http://www.unicode.org/Public/UNIDATA/UCD.html#UnicodeData.txt
      */
      $m = explode(';', $line);

      // Lines without a General Category field cannot be classified
      if (!isset($m[2][0]))
      {
          continue;
      }

      /**
      * @var integer $cp Current char codepoint
      * @var string $utf_char UTF-8 representation of current char
      */
      $cp = hexdec($m[0]);
      $utf_char = cp_to_utf($cp);

      /**
      * $m[2] holds the "General Category" of the character
      * @link http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
      */
      switch ($m[2][0])
      {
          case 'L':
              /**
              * We allow all letters and map them to their lowercased counterpart on the fly
              */
              if (preg_match('#^LATIN.*(?:LETTER|LIGATURE) ([A-Z]{2}(?![A-Z]))$#', $m[1], $capture))
              {
                  /**
                  * Special hack for some latin ligatures. Using the name of a character
                  * is bad practice, but for now it works well enough.
                  *
                  * @todo Note that ligatures with combining marks such as U+01E2 are
                  * not supported at this time
                  */
                  $map[$cp] = strtolower($capture[1]);
              }
              else if (isset($m[13][0]))
              {
                  /**
                  * If the letter has a lowercased form, use it
                  */
                  $map[$cp] = hex_to_utf($m[13]);
              }
              else
              {
                  /**
                  * In all other cases, map the letter to itself
                  */
                  $map[$cp] = $utf_char;
              }
          break;

          case 'M':
              /**
              * We allow all marks, they are mapped to themselves
              */
              $map[$cp] = $utf_char;
          break;

          case 'N':
              /**
              * We allow all numbers, but we map them to their numeric value whenever
              * possible. The numeric value (field #8) is in ASCII already
              *
              * @todo Note that fractions such as U+00BD will be converted to something
              * like "1/2", with a slash. However, "1/2" entered in ASCII is converted
              * to "1 2". This will have to be fixed.
              */
              $map[$cp] = (isset($m[8][0])) ? $m[8] : $utf_char;
          break;

          default:
              /**
              * Everything else is ignored, skip to the next line
              */
              continue 2;
      }
  }
  fclose($fp);
  /**
  * Add some cheating
  *
  * Hand-picked replacements, keyed by the hexadecimal codepoint. They take
  * precedence over whatever the map already holds for these codepoints.
  */
  $cheats = array(
      '00DF' => 'ss', # German sharp S
      '00C4' => 'ae', # Capital A with diaeresis (U+00C5 would be A with ring above)
      '00E4' => 'ae', # Small A with diaeresis
      '00D6' => 'oe', # Capital O with diaeresis
      '00F6' => 'oe', # Small O with diaeresis
      '00DC' => 'ue', # Capital U with diaeresis
      '00FC' => 'ue', # Small U with diaeresis
  );

  /**
  * Add our "cheat replacements" to the map
  */
  foreach ($cheats as $hex => $map_to)
  {
      $map[hexdec($hex)] = $map_to;
  }
  /**
  * Split the map into smaller blocks
  *
  * Codepoints are grouped by their value shifted right by 11 bits, i.e. each
  * block covers 2048 consecutive codepoints. Every block is written to its own
  * search_indexer_<block>.<ext> file, which returns the block as an array.
  */
  $file_contents = array();

  foreach ($map as $cp => $map_to)
  {
      $file_contents[$cp >> 11][cp_to_utf($cp)] = $map_to;
  }
  unset($map);

  foreach ($file_contents as $idx => $contents)
  {
      $path = $phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx;

      echo "Writing to search_indexer_$idx.$phpEx\n";

      // Stop here: writing to a handle that failed to open cannot succeed
      if (!$fp = fopen($path, 'wb'))
      {
          die("Can't open $path for output\n");
      }

      fwrite($fp, '<?php return ' . my_var_export($contents) . ';');
      fclose($fp);
  }

  echo "\n*** Search indexer tables done\n\n";

  die("\nAll done!\n");
  366. ////////////////////////////////////////////////////////////////////////////////
  367. // Internal functions //
  368. ////////////////////////////////////////////////////////////////////////////////
  /**
  * Decompose a sequence recursively
  *
  * Every codepoint of the sequence that has an entry of its own in the mapping
  * is replaced by the decomposition of that entry; codepoints without an entry
  * are kept unchanged.
  *
  * @param array $decomp_map Decomposition mapping, passed by reference
  * @param string $decomp_seq Decomposition sequence as decimal codepoints separated with a space
  * @return string Decomposition sequence, fully decomposed
  */
  function decompose(&$decomp_map, $decomp_seq)
  {
      $pieces = explode(' ', $decomp_seq);

      foreach ($pieces as $idx => $codepoint)
      {
          if (!isset($decomp_map[$codepoint]))
          {
              // Not decomposable any further, keep as is
              continue;
          }

          $pieces[$idx] = decompose($decomp_map, $decomp_map[$codepoint]);
      }

      return implode(' ', $pieces);
  }
  /**
  * Return a parsable representation of a variable
  *
  * This function is limited to arrays, strings and integers. Arrays are
  * rendered as a compact array(...) expression, strings as single-quoted
  * literals with backslashes and single quotes escaped. Any other value is
  * returned untouched.
  *
  * @param mixed $var Variable
  * @return string PHP code representing the variable
  */
  function my_var_export($var)
  {
      if (is_string($var))
      {
          $escaped = str_replace(array('\\', "'"), array('\\\\', "\\'"), $var);

          return "'" . $escaped . "'";
      }

      if (!is_array($var))
      {
          return $var;
      }

      $code = '';
      $separator = '';

      foreach ($var as $key => $value)
      {
          $code .= $separator . my_var_export($key) . '=>' . my_var_export($value);
          $separator = ',';
      }

      return 'array(' . $code . ')';
  }
  /**
  * Download a file to the develop/ dir
  *
  * Nothing happens when a file of the same name already exists there.
  * Otherwise the URL is copied in chunks while a running "<n> KiB" counter is
  * printed; the script dies if either the source or the target cannot be opened.
  *
  * @param string $url URL of the file to download
  * @return null
  */
  function download($url)
  {
      global $phpbb_root_path;

      $filename = basename($url);
      $target = $phpbb_root_path . 'develop/' . $filename;

      if (file_exists($target))
      {
          return;
      }

      echo 'Downloading from ', $url, ' ';

      $fpr = fopen($url, 'rb');
      if (!$fpr)
      {
          die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
      }

      $fpw = fopen($target, 'wb');
      if (!$fpw)
      {
          die("Can't open develop/" . $filename . " for output... please check your permissions or something");
      }

      $written = 0;
      $progress = '';

      while (!feof($fpr))
      {
          $written += fwrite($fpw, fread($fpr, 32768));

          // Erase the previous counter with backspaces before printing the new one
          $erase = str_repeat("\x08", strlen($progress));
          $progress = ($written >> 10) . ' KiB';

          echo $erase;
          echo $progress;
      }

      fclose($fpr);
      fclose($fpw);

      echo "\n";
  }
  /**
  * Convert a codepoint given in hexadecimal to a UTF-8 char
  *
  * @param string $hex Codepoint, in hexadecimal
  * @return string UTF-8 char
  */
  function hex_to_utf($hex)
  {
      $codepoint = hexdec($hex);

      return cp_to_utf($codepoint);
  }
  /**
  * Build a UTF-8 string from a sequence of hexadecimal codepoints
  *
  * @param string $seq Sequence of codepoints, separated with a space
  * @return string UTF-8 string
  */
  function hexseq_to_utf($seq)
  {
      $utf = '';

      foreach (explode(' ', $seq) as $hex)
      {
          $utf .= hex_to_utf($hex);
      }

      return $utf;
  }
  /**
  * Convert a codepoint to a UTF-8 char
  *
  * Codepoints above 0xFFFF produce four bytes, above 0x7FF three bytes, above
  * 0x7F two bytes and everything else a single byte.
  *
  * @param integer $cp Unicode codepoint
  * @return string UTF-8 string
  */
  function cp_to_utf($cp)
  {
      if ($cp > 0xFFFF)
      {
          $lead = chr(0xF0 | ($cp >> 18));
          $second = chr(0x80 | (($cp >> 12) & 0x3F));
          $third = chr(0x80 | (($cp >> 6) & 0x3F));
          $last = chr(0x80 | ($cp & 0x3F));

          return $lead . $second . $third . $last;
      }

      if ($cp > 0x7FF)
      {
          $lead = chr(0xE0 | ($cp >> 12));
          $second = chr(0x80 | (($cp >> 6) & 0x3F));
          $last = chr(0x80 | ($cp & 0x3F));

          return $lead . $second . $last;
      }

      if ($cp > 0x7F)
      {
          $lead = chr(0xC0 | ($cp >> 6));
          $last = chr(0x80 | ($cp & 0x3F));

          return $lead . $last;
      }

      return chr($cp);
  }