PageRenderTime 43ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/phpBB/develop/generate_utf_confusables.php

https://github.com/naderman/phpbb-orchestra
PHP | 244 lines | 180 code | 26 blank | 38 comment | 23 complexity | 19740703a1e9aff52cdd1d4663940d11 MD5 | raw file
  1. <?php
  2. /**
  3. *
  4. * @package phpBB3
  5. * @version $Id$
  6. * @copyright (c) 2005 phpBB Group
  7. * @license http://opensource.org/licenses/gpl-license.php GNU Public License
  8. *
  9. */
  // This generator is a command line tool; refuse to run under any other SAPI.
  if (php_sapi_name() != 'cli')
  {
      die("This program must be run from the command line.\n");
  }

  //
  // Security message:
  //
  // This script is potentially dangerous.
  // Remove or comment out the next line (die(".... ) to enable this script.
  // Do NOT FORGET to either remove this script or disable it after you have used it.
  //
  die("Please read the first lines of this script for instructions on how to enable it");

  // Run without an execution time limit.
  set_time_limit(0);

  define('IN_PHPBB', true);
  $phpbb_root_path = '../';
  // File extension of this script; reused below to name the generated data file.
  $phpEx = substr(strrchr(__FILE__, '.'), 1);

  echo "Checking for required files\n";
  download('http://unicode.org/reports/tr39/data/confusables.txt');
  download('http://unicode.org/Public/UNIDATA/CaseFolding.txt');
  echo "\n";

  /**
  * Load the confusables table
  */
  echo "Loading confusables\n";
  $unidata = file_get_contents('confusables.txt');
  if ($unidata === false)
  {
      // Without this table the generated file would silently contain only
      // the hand-written mappings, so stop instead of continuing.
      die("Can't read confusables.txt\n");
  }

  /**
  * Load the CaseFolding table
  */
  echo "Loading CaseFolding\n";
  $casefolds = file_get_contents('CaseFolding.txt');
  if ($casefolds === false)
  {
      // Without this table no case-folded code point would be filtered out.
      die("Can't read CaseFolding.txt\n");
  }
  /**
  * Encode a Unicode code point as a UTF-8 byte sequence
  *
  * Code points up to 0x7F produce one byte, up to 0x7FF two bytes,
  * up to 0xFFFF three bytes and anything larger four bytes.
  * The value is not validated (no range or surrogate checks).
  *
  * @param int $cp Unicode code point
  * @return string UTF-8 encoded character
  */
  function utf8_chr($cp)
  {
      // Single byte: the code point is the byte itself
      if ($cp <= 0x7F)
      {
          return chr($cp);
      }

      // Every multi-byte form ends with a continuation byte holding the low 6 bits
      $last = chr(0x80 | ($cp & 0x3F));

      if ($cp <= 0x7FF)
      {
          return chr(0xC0 | ($cp >> 6)) . $last;
      }

      // Three and four byte forms share the continuation byte for bits 6-11
      $second_last = chr(0x80 | (($cp >> 6) & 0x3F));

      if ($cp <= 0xFFFF)
      {
          return chr(0xE0 | ($cp >> 12)) . $second_last . $last;
      }

      return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . $second_last . $last;
  }
  // Parse the confusables table. Matches are stored per line (PREG_SET_ORDER):
  // [0] = the whole matched line, [1] = source code point (hex),
  // [2] = target sequence (hex code points, each followed by a space).
  preg_match_all('/^([0-9A-F]+) ;\s((?:[0-9A-F]+ )*);.*?$/im', $unidata, $array, PREG_SET_ORDER);

  // Parse the case folding table. Matches are stored per capture group:
  // [1] = list of source code points (hex), [2] = status (C, F or S), [3] = folded sequence.
  preg_match_all('/^([0-9A-F]+); ([CFS]); ([0-9A-F]+(?: [0-9A-F]+)*);/im', $casefolds, $casefold_array);

  // some that we defined ourselves
  $uniarray = array(
      "\xC2\xA1" => "\x69", // EXCLAMATION MARK, INVERTED => LATIN SMALL LETTER I
      "\xC7\x83" => "\x21", // LATIN LETTER RETROFLEX CLICK => EXCLAMATION MARK
      "\xCE\xB1" => "\x61", // GREEK SMALL LETTER ALPHA => LATIN SMALL LETTER A
      "\xE1\x9A\x80" => "\x20", // OGHAM SPACE MARK
      "\xC2\xAD" => '', // HYPHEN, SOFT => empty string
      "\xDB\x9D" => '', // ARABIC END OF AYAH
      "\xDC\x8F" => '', // SYRIAC ABBREVIATION MARK
      "\xE1\xA0\x86" => '', // MONGOLIAN TODO SOFT HYPHEN
      "\xE1\xA0\x8E" => '', // MONGOLIAN VOWEL SEPARATOR
      "\xE2\x80\x8B" => '', // ZERO WIDTH SPACE
      "\xE2\x80\x8C" => '', // ZERO WIDTH NON-JOINER
      "\xE2\x80\x8D" => '', // ZERO WIDTH JOINER
      "\xE2\x80\xA8" => '', // LINE SEPARATOR
      "\xE2\x80\xA9" => '', // PARAGRAPH SEPARATOR
      "\xE2\x81\xA0" => '', // WORD JOINER
      "\xE2\x81\xA1" => '', // FUNCTION APPLICATION
      "\xE2\x81\xA2" => '', // INVISIBLE TIMES
      "\xE2\x81\xA3" => '', // INVISIBLE SEPARATOR
      "\xE2\x81\xAA" => '', // [CONTROL CHARACTERS]
      "\xE2\x81\xAB" => '', // [CONTROL CHARACTERS]
      "\xE2\x81\xAC" => '', // [CONTROL CHARACTERS]
      "\xE2\x81\xAD" => '', // [CONTROL CHARACTERS]
      "\xE2\x81\xAE" => '', // [CONTROL CHARACTERS]
      "\xE2\x81\xAF" => '', // [CONTROL CHARACTERS]
      "\xEF\xBB\xBF" => '', // ZERO WIDTH NO-BREAK SPACE
      "\xEF\xBF\xB9" => '', // [CONTROL CHARACTERS]
      "\xEF\xBF\xBA" => '', // [CONTROL CHARACTERS]
      "\xEF\xBF\xBB" => '', // [CONTROL CHARACTERS]
      "\xEF\xBF\xBC" => '', // [CONTROL CHARACTERS]
      "\xF0\x9D\x85\xB3" => '', // [MUSICAL CONTROL CHARACTERS]
      "\xF0\x9D\x85\xB4" => '', // [MUSICAL CONTROL CHARACTERS]
      "\xF0\x9D\x85\xB5" => '', // [MUSICAL CONTROL CHARACTERS]
      "\xF0\x9D\x85\xB6" => '', // [MUSICAL CONTROL CHARACTERS]
      "\xF0\x9D\x85\xB7" => '', // [MUSICAL CONTROL CHARACTERS]
      "\xF0\x9D\x85\xB8" => '', // [MUSICAL CONTROL CHARACTERS]
      "\xF0\x9D\x85\xB9" => '', // [MUSICAL CONTROL CHARACTERS]
      "\xF0\x9D\x85\xBA" => '', // [MUSICAL CONTROL CHARACTERS]
  );

  // Snapshot of the hand-written keys, used below to report table entries that touch them
  $copy = $uniarray;

  /**
  * @todo we need to check that the $uniarray does not reverse any of the mappings defined in the unicode definition
  */
  foreach ($array as $value)
  {
      // UTF-8 form of the source character and of the sequence it is confusable with
      $source = utf8_chr(hexdec((string) $value[1]));
      $temp_hold = implode(array_map('utf8_chr', array_map('hexdec', explode(' ', trim($value[2])))));

      // The table also defines one of our hand-written entries: print its bytes
      // as \xNN escapes and, if the targets disagree, print both sides.
      if (isset($copy[$source]))
      {
          $num = '';

          for ($i = 0; $i < strlen($source); $i++)
          {
              $num .= '\x' . str_pad(base_convert(ord($source[$i]), 10, 16), 2, '0', STR_PAD_LEFT);
          }

          echo $num . "\n";

          // Strict comparison so that two different numeric-looking strings are never treated as equal
          if ($uniarray[$source] !== $temp_hold)
          {
              echo " --> $source\n";
              echo " --> " . $temp_hold . "\n";
          }
      }

      // do some tests for things that transform into something with the number one
      if (strpos($temp_hold, utf8_chr(0x0031)) !== false)
      {
          // any kind of letter L? ($value[0] is the whole matched table line)
          if (strpos($value[0], 'LETTER L') !== false || strpos($value[0], 'IOTA') !== false || strpos($value[0], 'SMALL L ') !== false || preg_match('/SMALL LIGATURE [^L]*L /', $value[0]))
          {
              // replace all of the mappings that transform some sort of letter l to number one instead to some sort of letter l to latin small letter l
              $temp_hold = str_replace(utf8_chr(0x0031), utf8_chr(0x006C), $temp_hold);
          }
      }

      // uppercased chars that were folded do not exist in this universe,
      // no amount of normalization could ever "trick" this into not working
      //
      // The comparison has to be strict: hex code points such as "05E1" are
      // numeric strings (5E1 == 50) and would loosely equal "0050", which
      // would wrongly drop the mapping for an unrelated character.
      if (in_array($value[1], $casefold_array[1], true))
      {
          continue;
      }

      $uniarray[$source] = $temp_hold;
  }

  echo "Writing to confusables.$phpEx\n";

  $fp = fopen($phpbb_root_path . 'includes/utf/data/confusables.' . $phpEx, 'wb');

  if (!$fp)
  {
      // Stop with a clear message instead of passing an invalid handle to fwrite()
      die("Can't open includes/utf/data/confusables.$phpEx for output\n");
  }

  // The generated file simply returns the mapping array when it is included
  fwrite($fp, '<?php return ' . my_var_export($uniarray) . ';');
  fclose($fp);
  /**
  * Build PHP source code that evaluates to the given variable
  *
  * Only arrays, strings and integers are supported. Arrays are exported
  * recursively (keys and values alike) without any whitespace, strings are
  * single-quoted with backslashes and single quotes escaped, and every
  * other value is handed back unchanged.
  *
  * @param mixed $var Variable
  * @return string PHP code representing the variable
  */
  function my_var_export($var)
  {
      if (is_string($var))
      {
          // Escape backslashes first, then single quotes
          return "'" . str_replace(array('\\', "'"), array('\\\\', "\\'"), $var) . "'";
      }

      if (!is_array($var))
      {
          // Anything else (integers in practice) is returned as is
          return $var;
      }

      $code = 'array(';
      $separator = '';

      foreach ($var as $key => $item)
      {
          $code .= $separator . my_var_export($key) . '=>' . my_var_export($item);
          $separator = ',';
      }

      return $code . ')';
  }
  /**
  * Download a file to the develop/ dir
  *
  * Nothing is downloaded when a file with the same base name already exists
  * in develop/. Progress is printed while the transfer runs. The script is
  * terminated if the source or target cannot be opened or if the transfer
  * fails midway; in the latter case the incomplete file is removed so that
  * it is not mistaken for a finished download on the next run.
  *
  * @param string $url URL of the file to download
  * @return void
  */
  function download($url)
  {
      global $phpbb_root_path;

      $filename = basename($url);
      $target = $phpbb_root_path . 'develop/' . $filename;

      if (file_exists($target))
      {
          return;
      }

      echo 'Downloading from ', $url, ' ';

      if (!$fpr = fopen($url, 'rb'))
      {
          die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
      }

      if (!$fpw = fopen($target, 'wb'))
      {
          fclose($fpr);
          die("Can't open develop/" . $filename . " for output... please check your permissions or something");
      }

      $i = 0;
      $chunk = 32768;
      $done = '';

      while (!feof($fpr))
      {
          $data = fread($fpr, $chunk);
          $written = ($data === false) ? false : fwrite($fpw, $data);

          if ($written === false)
          {
              // A failed read never reaches end-of-file, so the loop would spin
              // forever; a failed write would leave a truncated file behind.
              fclose($fpr);
              fclose($fpw);
              unlink($target);
              die("\nDownload of $url failed, the incomplete develop/" . $filename . " has been removed\n");
          }

          $i += $written;

          // Backspace over the previous progress figure before printing the new one
          echo str_repeat("\x08", strlen($done));
          $done = ($i >> 10) . ' KiB';
          echo $done;
      }

      fclose($fpr);
      fclose($fpw);
      echo "\n";
  }
  209. ?>