PageRenderTime 22ms CodeModel.GetById 0ms RepoModel.GetById 1ms app.codeStats 0ms

/phpBB/develop/generate_utf_confusables.php

https://github.com/VSEphpbb/phpbb
PHP | 245 lines | 178 code | 26 blank | 41 comment | 23 complexity | 8ac44e08128d2a54e35baa16f45a824b MD5 | raw file
  1. <?php
  2. /**
  3. *
  4. * This file is part of the phpBB Forum Software package.
  5. *
  6. * @copyright (c) phpBB Limited <https://www.phpbb.com>
  7. * @license GNU General Public License, version 2 (GPL-2.0)
  8. *
  9. * For full copyright and license information, please see
  10. * the docs/CREDITS.txt file.
  11. *
  12. */
  13. if (php_sapi_name() != 'cli')
  14. {
  15. die("This program must be run from the command line.\n");
  16. }
  17. //
  18. // Security message:
  19. //
  20. // This script is potentially dangerous.
  21. // Remove or comment the next line (die(".... ) to enable this script.
  22. // Do NOT FORGET to either remove this script or disable it after you have used it.
  23. //
  24. die("Please read the first lines of this script for instructions on how to enable it");
  25. set_time_limit(0);
  26. define('IN_PHPBB', true);
  27. $phpbb_root_path = '../';
  28. $phpEx = substr(strrchr(__FILE__, '.'), 1);
  29. echo "Checking for required files\n";
  30. download('http://unicode.org/reports/tr39/data/confusables.txt');
  31. download('http://unicode.org/Public/UNIDATA/CaseFolding.txt');
  32. echo "\n";
  33. /**
  34. * Load the confusables table
  35. */
  36. echo "Loading confusables\n";
  37. $unidata = file_get_contents('confusables.txt');
  38. /**
  39. * Load the CaseFolding table
  40. */
  41. echo "Loading CaseFolding\n";
  42. $casefolds = file_get_contents('CaseFolding.txt');
  43. function utf8_chr($cp)
  44. {
  45. if ($cp > 0xFFFF)
  46. {
  47. return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
  48. }
  49. else if ($cp > 0x7FF)
  50. {
  51. return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
  52. }
  53. else if ($cp > 0x7F)
  54. {
  55. return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
  56. }
  57. else
  58. {
  59. return chr($cp);
  60. }
  61. }
  62. preg_match_all('/^([0-9A-F]+) ;\s((?:[0-9A-F]+ )*);.*?$/im', $unidata, $array, PREG_SET_ORDER);
  63. preg_match_all('/^([0-9A-F]+); ([CFS]); ([0-9A-F]+(?: [0-9A-F]+)*);/im', $casefolds, $casefold_array);
  64. // some that we defined ourselves
  65. $uniarray = array(
  66. "\xC2\xA1" => "\x69", // EXCLAMATION MARK, INVERTED => LATIN SMALL LETTER I
  67. "\xC7\x83" => "\x21", // LATIN LETTER RETROFLEX CLICK => EXCLAMATION MARK
  68. "\xCE\xB1" => "\x61", // GREEK SMALL LETTER ALPHA => LATIN SMALL LETTER A
  69. "\xE1\x9A\x80" => "\x20", // OGHAM SPACE MARK
  70. "\xC2\xAD" => '', // HYPHEN, SOFT => empty string
  71. "\xDB\x9D" => '', // ARABIC END OF AYAH
  72. "\xDC\x8F" => '', // SYRIAC ABBREVIATION MARK
  73. "\xE1\xA0\x86" => '', // MONGOLIAN TODO SOFT HYPHEN
  74. "\xE1\xA0\x8E" => '', // MONGOLIAN VOWEL SEPARATOR
  75. "\xE2\x80\x8B" => '', // ZERO WIDTH SPACE
  76. "\xE2\x80\x8C" => '', // ZERO WIDTH NON-JOINER
  77. "\xE2\x80\x8D" => '', // ZERO WIDTH JOINER
  78. "\xE2\x80\xA8" => '', // LINE SEPARATOR
  79. "\xE2\x80\xA9" => '', // PARAGRAPH SEPARATOR
  80. "\xE2\x81\xA0" => '', // WORD JOINER
  81. "\xE2\x81\xA1" => '', // FUNCTION APPLICATION
  82. "\xE2\x81\xA2" => '', // INVISIBLE TIMES
  83. "\xE2\x81\xA3" => '', // INVISIBLE SEPARATOR
  84. "\xE2\x81\xAA" => '', // [CONTROL CHARACTERS]
  85. "\xE2\x81\xAB" => '', // [CONTROL CHARACTERS]
  86. "\xE2\x81\xAC" => '', // [CONTROL CHARACTERS]
  87. "\xE2\x81\xAD" => '', // [CONTROL CHARACTERS]
  88. "\xE2\x81\xAE" => '', // [CONTROL CHARACTERS]
  89. "\xE2\x81\xAF" => '', // [CONTROL CHARACTERS]
  90. "\xEF\xBB\xBF" => '', // ZERO WIDTH NO-BREAK SPACE
  91. "\xEF\xBF\xB9" => '', // [CONTROL CHARACTERS]
  92. "\xEF\xBF\xBA" => '', // [CONTROL CHARACTERS]
  93. "\xEF\xBF\xBB" => '', // [CONTROL CHARACTERS]
  94. "\xEF\xBF\xBC" => '', // [CONTROL CHARACTERS]
  95. "\xF0\x9D\x85\xB3" => '', // [MUSICAL CONTROL CHARACTERS]
  96. "\xF0\x9D\x85\xB4" => '', // [MUSICAL CONTROL CHARACTERS]
  97. "\xF0\x9D\x85\xB5" => '', // [MUSICAL CONTROL CHARACTERS]
  98. "\xF0\x9D\x85\xB6" => '', // [MUSICAL CONTROL CHARACTERS]
  99. "\xF0\x9D\x85\xB7" => '', // [MUSICAL CONTROL CHARACTERS]
  100. "\xF0\x9D\x85\xB8" => '', // [MUSICAL CONTROL CHARACTERS]
  101. "\xF0\x9D\x85\xB9" => '', // [MUSICAL CONTROL CHARACTERS]
  102. "\xF0\x9D\x85\xBA" => '', // [MUSICAL CONTROL CHARACTERS]
  103. );
  104. $copy = $uniarray;
  105. /**
  106. * @todo we need to check that the $uniarray does not reverse any of the mappings defined in the unicode definition
  107. */
  108. foreach ($array as $value)
  109. {
  110. $temp_hold = implode(array_map('utf8_chr', array_map('hexdec', explode(' ', trim($value[2])))));
  111. if (isset($copy[utf8_chr(hexdec((string)$value[1]))]))
  112. {
  113. $num = '';
  114. $string = utf8_chr(hexdec((string)$value[1]));
  115. for ($i = 0; $i < strlen($string); $i++)
  116. {
  117. $num .= '\x' . str_pad(base_convert(ord($string[$i]), 10, 16), 2, '0', STR_PAD_LEFT);
  118. }
  119. echo $num . "\n";
  120. if ($uniarray[$string] != $temp_hold)
  121. {
  122. echo " --> $string\n";
  123. echo " --> " . $temp_hold . "\n";
  124. }
  125. }
  126. // do some tests for things that transform into something with the number one
  127. if (strpos($temp_hold, utf8_chr(0x0031)) !== false)
  128. {
  129. // any kind of letter L?
  130. if (strpos($value[0], 'LETTER L') !== false || strpos($value[0], 'IOTA') !== false || strpos($value[0], 'SMALL L ') !== false || preg_match('/SMALL LIGATURE [^L]*L /', $value[0]))
  131. {
  132. // replace all of the mappings that transform some sort of letter l to number one instead to some sort of letter l to latin small letter l
  133. $temp_hold = str_replace(utf8_chr(0x0031), utf8_chr(0x006C), $temp_hold);
  134. }
  135. }
  136. // uppercased chars that were folded do not exist in this universe,
  137. // no amount of normalization could ever "trick" this into not working
  138. if (in_array($value[1], $casefold_array[1]))
  139. {
  140. continue;
  141. }
  142. $uniarray[utf8_chr(hexdec((string)$value[1]))] = $temp_hold;
  143. }
  144. echo "Writing to confusables.$phpEx\n";
  145. $fp = fopen($phpbb_root_path . 'includes/utf/data/confusables.' . $phpEx, 'wb');
  146. fwrite($fp, '<?php return ' . my_var_export($uniarray) . ';');
  147. fclose($fp);
  148. /**
  149. * Return a parsable string representation of a variable
  150. *
  151. * This is function is limited to array/strings/integers
  152. *
  153. * @param mixed $var Variable
  154. * @return string PHP code representing the variable
  155. */
  156. function my_var_export($var)
  157. {
  158. if (is_array($var))
  159. {
  160. $lines = array();
  161. foreach ($var as $k => $v)
  162. {
  163. $lines[] = my_var_export($k) . '=>' . my_var_export($v);
  164. }
  165. return 'array(' . implode(',', $lines) . ')';
  166. }
  167. else if (is_string($var))
  168. {
  169. return "'" . str_replace(array('\\', "'"), array('\\\\', "\\'"), $var) . "'";
  170. }
  171. else
  172. {
  173. return $var;
  174. }
  175. }
  176. /**
  177. * Download a file to the develop/ dir
  178. *
  179. * @param string $url URL of the file to download
  180. * @return null
  181. */
  182. function download($url)
  183. {
  184. global $phpbb_root_path;
  185. if (file_exists($phpbb_root_path . 'develop/' . basename($url)))
  186. {
  187. return;
  188. }
  189. echo 'Downloading from ', $url, ' ';
  190. if (!$fpr = fopen($url, 'rb'))
  191. {
  192. die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
  193. }
  194. if (!$fpw = fopen($phpbb_root_path . 'develop/' . basename($url), 'wb'))
  195. {
  196. die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");
  197. }
  198. $i = 0;
  199. $chunk = 32768;
  200. $done = '';
  201. while (!feof($fpr))
  202. {
  203. $i += fwrite($fpw, fread($fpr, $chunk));
  204. echo str_repeat("\x08", strlen($done));
  205. $done = ($i >> 10) . ' KiB';
  206. echo $done;
  207. }
  208. fclose($fpr);
  209. fclose($fpw);
  210. echo "\n";
  211. }