PageRenderTime 51ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/class/Patchwork/Utf8.php

http://github.com/nicolas-grekas/Patchwork-UTF8
PHP | 450 lines | 371 code | 59 blank | 20 comment | 42 complexity | a503c7d83aa33361be57a7bee64b8833 MD5 | raw file
  1. <?php // vi: set fenc=utf-8 ts=4 sw=4 et:
  2. /*
  3. * Copyright (C) 2012 Nicolas Grekas - p@tchwork.com
  4. *
  5. * This library is free software; you can redistribute it and/or modify it
  6. * under the terms of the (at your option):
  7. * Apache License v2.0 (http://apache.org/licenses/LICENSE-2.0.txt), or
  8. * GNU General Public License v2.0 (http://gnu.org/licenses/gpl-2.0.txt).
  9. */
  10. namespace Patchwork;
  11. use Normalizer as n;
  12. /**
  13. * UTF-8 Grapheme Cluster aware string manipulations implementing the quasi complete
  14. * set of native PHP string functions that need UTF-8 awareness and more.
  15. * Missing are printf-family functions and number_format.
  16. */
  17. class Utf8
  18. {
  19. protected static
  20. $commonCaseFold = array(
  21. array('ľ','?',"\xCD\x85",'?',"\xCF\x90","\xCF\x91","\xCF\x95","\xCF\x96","\xCF\xB0","\xCF\xB1","\xCF\xB5","\xE1\xBA\x9B","\xE1\xBE\xBE"),
  22. array('?','s','?', '?','?', '?', '?', '?', '?', '?', '?', "\xE1\xB9\xA1",'?' )
  23. );
  24. static function isUtf8($s)
  25. {
  26. return (bool) preg_match('//u', $s); // Since PHP 5.2.5, this also excludes invalid five and six bytes sequences
  27. }
  28. // Generic UTF-8 to ASCII transliteration
  29. static function toAscii($s)
  30. {
  31. if (preg_match("/[\x80-\xFF]/", $s))
  32. {
  33. $s = n::normalize($s, n::NFKD);
  34. $s = preg_replace('/\p{Mn}+/u', '', $s);
  35. $s = iconv('UTF-8', 'ASCII' . ('glibc' !== ICONV_IMPL ? '//IGNORE' : '') . '//TRANSLIT', $s);
  36. }
  37. return $s;
  38. }
  39. // Unicode transformation for caseless matching
  40. // see http://unicode.org/reports/tr21/tr21-5.html
  41. static function strtocasefold($s, $full = true, $turkish = false)
  42. {
  43. $s = str_replace(self::$commonCaseFold[0], self::$commonCaseFold[1], $s);
  44. if ($turkish)
  45. {
  46. false !== strpos($s, 'I') && $s = str_replace('I', '?', $s);
  47. $full && false !== strpos($s, '?') && $s = str_replace('?', 'i', $s);
  48. }
  49. if ($full)
  50. {
  51. static $fullCaseFold = false;
  52. $fullCaseFold || $fullCaseFold = self::getData('caseFolding_full');
  53. $s = str_replace($fullCaseFold[0], $fullCaseFold[1], $s);
  54. }
  55. return self::strtolower($s);
  56. }
  57. // Generic case sensitive collation support for self::strnatcmp()
  58. static function strtonatfold($s)
  59. {
  60. $s = n::normalize($s, n::NFD);
  61. return preg_replace('/\p{Mn}+/u', '', $s);
  62. }
  63. // PHP string functions that need UTF-8 awareness
  64. static function substr($s, $start, $len = 2147483647)
  65. {
  66. /**/ if (extension_loaded('intl') && PHP_VERSION_ID < 50400)
  67. /**/ {
  68. return PHP\Override\Intl::grapheme_substr_workaround55562($s, $start, $len);
  69. /**/ }
  70. /**/ else
  71. /**/ {
  72. return grapheme_substr($s, $start, $len);
  73. /**/ }
  74. }
  75. static function strlen($s) {return grapheme_strlen($s);}
  76. static function strpos ($s, $needle, $offset = 0) {return grapheme_strpos ($s, $needle, $offset);}
  77. static function strrpos ($s, $needle, $offset = 0) {return grapheme_strrpos ($s, $needle, $offset);}
  78. static function stripos ($s, $needle, $offset = 0)
  79. {
  80. if ($offset < 0) $offset = 0;
  81. if (!$needle = mb_stripos($s, $needle, $offset, 'UTF-8')) return $needle;
  82. return grapheme_strlen(iconv_substr($s, 0, $needle, 'UTF-8'));
  83. }
  84. static function strripos($s, $needle, $offset = 0)
  85. {
  86. if ($offset < 0) $offset = 0;
  87. if (!$needle = mb_strripos($s, $needle, $offset, 'UTF-8')) return $needle;
  88. return grapheme_strlen(iconv_substr($s, 0, $needle, 'UTF-8'));
  89. }
  90. static function stristr ($s, $needle, $before_needle = false)
  91. {
  92. if ('' == (string) $needle) return false;
  93. return mb_stristr($s, $needle, $before_needle, 'UTF-8');
  94. }
  95. static function strstr ($s, $needle, $before_needle = false) {return grapheme_strstr ($s, $needle, $before_needle);}
  96. static function strrchr ($s, $needle, $before_needle = false) {return mb_strrchr ($s, $needle, $before_needle, 'UTF-8');}
  97. static function strrichr($s, $needle, $before_needle = false) {return mb_strrichr($s, $needle, $before_needle, 'UTF-8');}
  98. static function strtolower($s, $form = n::NFC) {if (n::isNormalized($s = mb_strtolower($s, 'UTF-8'), $form)) return $s; return n::normalize($s, $form);}
  99. static function strtoupper($s, $form = n::NFC) {if (n::isNormalized($s = mb_strtoupper($s, 'UTF-8'), $form)) return $s; return n::normalize($s, $form);}
  100. static function wordwrap($s, $width = 75, $break = "\n", $cut = false)
  101. {
  102. // This implementation could be extended to handle unicode word boundaries,
  103. // but that's enough work for today (see http://www.unicode.org/reports/tr29/)
  104. $width = (int) $width;
  105. $s = explode($break, $s);
  106. $iLen = count($s);
  107. $result = array();
  108. $line = '';
  109. $lineLen = 0;
  110. for ($i = 0; $i < $iLen; ++$i)
  111. {
  112. $words = explode(' ', $s[$i]);
  113. $line && $result[] = $line;
  114. $lineLen = grapheme_strlen($line);
  115. $jLen = count($words);
  116. for ($j = 0; $j < $jLen; ++$j)
  117. {
  118. $w = $words[$j];
  119. $wLen = grapheme_strlen($w);
  120. if ($lineLen + $wLen < $width)
  121. {
  122. if ($j) $line .= ' ';
  123. $line .= $w;
  124. $lineLen += $wLen + 1;
  125. }
  126. else
  127. {
  128. if ($j || $i) $result[] = $line;
  129. $line = '';
  130. $lineLen = 0;
  131. if ($cut && $wLen > $width)
  132. {
  133. $w = self::getGraphemeClusters($w);
  134. do
  135. {
  136. $result[] = implode('', array_slice($w, 0, $width));
  137. $line = implode('', $w = array_slice($w, $width));
  138. $lineLen = $wLen -= $width;
  139. }
  140. while ($wLen > $width);
  141. $w = implode('', $w);
  142. }
  143. $line = $w;
  144. $lineLen = $wLen;
  145. }
  146. }
  147. }
  148. $line && $result[] = $line;
  149. return implode($break, $result);
  150. }
  151. static function chr($c)
  152. {
  153. $c %= 0x200000;
  154. return $c < 0x80 ? chr($c) : (
  155. $c < 0x800 ? chr(0xC0 | $c>> 6) . chr(0x80 | $c & 0x3F) : (
  156. $c < 0x10000 ? chr(0xE0 | $c>>12) . chr(0x80 | $c>> 6 & 0x3F) . chr(0x80 | $c & 0x3F) : (
  157. chr(0xF0 | $c>>18) . chr(0x80 | $c>>12 & 0x3F) . chr(0x80 | $c>>6 & 0x3F) . chr(0x80 | $c & 0x3F)
  158. )));
  159. }
  160. static function count_chars($s, $mode = 0)
  161. {
  162. if (1 != $mode) user_error(__METHOD__ . '(): the only allowed $mode is 1', E_USER_WARNING);
  163. $s = self::getGraphemeClusters($s);
  164. return array_count_values($s);
  165. }
  166. static function ltrim($s, $charlist = INF)
  167. {
  168. $charlist = INF === $charlist ? '\s' : self::rxClass($charlist);
  169. return preg_replace("/^{$charlist}+/u", '', $s);
  170. }
  171. static function ord($s)
  172. {
  173. $s = unpack('C*', substr($s, 0, 6));
  174. $a = $s ? $s[1] : 0;
  175. return 240 <= $a && $a <= 255 ? (($a-240) << 18) + (($s[2]-128) << 12) + (($s[3]-128) << 6) + $s[4]-128 : (
  176. 224 <= $a && $a <= 239 ? (($a-224) << 12) + (($s[2]-128) << 6) + $s[3]-128 : (
  177. 192 <= $a && $a <= 223 ? (($a-192) << 6) + $s[2]-128 : (
  178. $a)));
  179. }
  180. static function rtrim($s, $charlist = INF)
  181. {
  182. $charlist = INF === $charlist ? '\s' : self::rxClass($charlist);
  183. return preg_replace("/{$charlist}+$/u", '', $s);
  184. }
  185. static function trim($s, $charlist = INF) {return self::rtrim(self::ltrim($s, $charlist), $charlist);}
  186. static function str_ireplace($search, $replace, $subject, &$count = null)
  187. {
  188. $search = (array) $search;
  189. foreach ($search as &$s) $s = '' !== (string) $s ? '/' . preg_quote($s, '/') . '/ui' : '/^(?<=.)$/';
  190. $subject = preg_replace($search, $replace, $subject, -1, $replace);
  191. $count = $replace;
  192. return $subject;
  193. }
  194. static function str_pad($s, $len, $pad = ' ', $type = STR_PAD_RIGHT)
  195. {
  196. $slen = grapheme_strlen($s);
  197. if ($len <= $slen) return $s;
  198. $padlen = grapheme_strlen($pad);
  199. $freelen = $len - $slen;
  200. $len = $freelen % $padlen;
  201. if (STR_PAD_RIGHT == $type) return $s . str_repeat($pad, $freelen / $padlen) . ($len ? grapheme_substr($pad, 0, $len) : '');
  202. if (STR_PAD_LEFT == $type) return str_repeat($pad, $freelen / $padlen) . ($len ? grapheme_substr($pad, 0, $len) : '') . $s;
  203. if (STR_PAD_BOTH == $type)
  204. {
  205. $freelen /= 2;
  206. $type = ceil($freelen);
  207. $len = $type % $padlen;
  208. $s .= str_repeat($pad, $type / $padlen) . ($len ? grapheme_substr($pad, 0, $len) : '');
  209. $type = floor($freelen);
  210. $len = $type % $padlen;
  211. return str_repeat($pad, $type / $padlen) . ($len ? grapheme_substr($pad, 0, $len) : '') . $s;
  212. }
  213. user_error(__METHOD__ . '(): Padding type has to be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH.');
  214. }
  215. static function str_shuffle($s)
  216. {
  217. $s = self::getGraphemeClusters($s);
  218. shuffle($s);
  219. return implode('', $s);
  220. }
  221. static function str_split($s, $len = 1)
  222. {
  223. $len = (int) $len;
  224. if ($len < 1) return str_split($s, $len);
  225. $s = self::getGraphemeClusters($s);
  226. if (1 === $len) return $s;
  227. $a = array();
  228. $j = -1;
  229. foreach ($s as $i => $s)
  230. {
  231. if ($i % $len) $a[$j] .= $s;
  232. else $a[++$j] = $s;
  233. }
  234. return $a;
  235. }
  236. static function str_word_count($s, $format = 0, $charlist = '')
  237. {
  238. $charlist = self::rxClass($charlist, '\pL');
  239. $s = preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $s, -1, PREG_SPLIT_DELIM_CAPTURE);
  240. $charlist = array();
  241. $len = count($s);
  242. if (1 == $format) for ($i = 1; $i < $len; $i+=2) $charlist[] = $s[$i];
  243. else if (2 == $format)
  244. {
  245. $offset = grapheme_strlen($s[0]);
  246. for ($i = 1; $i < $len; $i+=2)
  247. {
  248. $charlist[$offset] = $s[$i];
  249. $offset += grapheme_strlen($s[$i]) + grapheme_strlen($s[$i+1]);
  250. }
  251. }
  252. else $charlist = ($len - 1) / 2;
  253. return $charlist;
  254. }
  255. static function strcmp ($a, $b) {return (string) $a === (string) $b ? 0 : strcmp(n::normalize($a, n::NFD), n::normalize($b, n::NFD));}
  256. static function strnatcmp ($a, $b) {return (string) $a === (string) $b ? 0 : strnatcmp(self::strtonatfold($a), self::strtonatfold($b));}
  257. static function strcasecmp ($a, $b) {return self::strcmp (self::strtocasefold($a), self::strtocasefold($b));}
  258. static function strnatcasecmp($a, $b) {return self::strnatcmp(self::strtocasefold($a), self::strtocasefold($b));}
  259. static function strncasecmp ($a, $b, $len) {return self::strncmp(self::strtocasefold($a), self::strtocasefold($b), $len);}
  260. static function strncmp ($a, $b, $len) {return self::strcmp(self::substr($a, 0, $len), self::substr($b, 0, $len));}
  261. static function strcspn($s, $charlist, $start = 0, $len = 2147483647)
  262. {
  263. if ('' === (string) $charlist) return null;
  264. if ($start || 2147483647 != $len) $s = self::substr($s, $start, $len);
  265. return preg_match('/^(.*?)' . self::rxClass($charlist) . '/us', $s, $len) ? grapheme_strlen($len[1]) : grapheme_strlen($s);
  266. }
  267. static function strpbrk($s, $charlist)
  268. {
  269. if (preg_match('/' . self::rxClass($charlist) . '/us', $s, $m)) return substr($s, strpos($s, $m[0]));
  270. else return false;
  271. }
  272. static function strrev($s)
  273. {
  274. $s = self::getGraphemeClusters($s);
  275. return implode('', array_reverse($s));
  276. }
  277. static function strspn($s, $mask, $start = 0, $len = 2147483647)
  278. {
  279. if ($start || 2147483647 != $len) $s = self::substr($s, $start, $len);
  280. return preg_match('/^' . self::rxClass($mask) . '+/u', $s, $s) ? grapheme_strlen($s[0]) : 0;
  281. }
  282. static function strtr($s, $from, $to = INF)
  283. {
  284. if (INF !== $to)
  285. {
  286. $from = self::getGraphemeClusters($from);
  287. $to = self::getGraphemeClusters($to);
  288. $a = count($from);
  289. $b = count($to);
  290. if ($a > $b) $from = array_slice($from, 0, $b);
  291. else if ($a < $b) $to = array_slice($to , 0, $a);
  292. $from = array_combine($from, $to);
  293. }
  294. return strtr($s, $from);
  295. }
  296. static function substr_compare($a, $b, $offset, $len = 2147483647, $i = 0)
  297. {
  298. $a = self::substr($a, $offset, $len);
  299. return $i ? self::strcasecmp($a, $b) : self::strcmp($a, $b);
  300. }
  301. static function substr_count($s, $needle, $offset = 0, $len = 2147483647)
  302. {
  303. return substr_count(self::substr($s, $offset, $len), $needle);
  304. }
  305. static function substr_replace($s, $replace, $start, $len = 2147483647)
  306. {
  307. $s = self::getGraphemeClusters($s);
  308. $replace = self::getGraphemeClusters($replace);
  309. array_splice($s, $start, $len, $replace);
  310. return implode('', $s);
  311. }
  312. static function ucfirst($s)
  313. {
  314. $c = iconv_substr($s, 0, 1, 'UTF-8');
  315. return self::ucwords($c) . substr($s, strlen($c));
  316. }
  317. static function lcfirst($s)
  318. {
  319. $c = iconv_substr($s, 0, 1, 'UTF-8');
  320. return mb_strtolower($c, 'UTF-8') . substr($s, strlen($c));
  321. }
  322. static function ucwords($s)
  323. {
  324. return mb_convert_case($s, MB_CASE_TITLE, 'UTF-8');
  325. }
  326. static function getGraphemeClusters($s)
  327. {
  328. /**/ if (extension_loaded('intl'))
  329. /**/ {
  330. $gca = array();
  331. $pos = 0;
  332. $len = strlen($s);
  333. while ($pos < $len) $gca[] = grapheme_extract($s, 1, GRAPHEME_EXTR_COUNT, $pos, $pos);
  334. return $gca;
  335. /**/ }
  336. /**/ else
  337. /**/ {
  338. preg_match_all('/' . PHP\Override\Intl::GRAPHEME_CLUSTER_RX . '/u', $s, $s);
  339. return $s[0];
  340. /**/ }
  341. }
  342. protected static function rxClass($s, $class = '')
  343. {
  344. $class = array($class);
  345. foreach (self::getGraphemeClusters($s) as $s)
  346. {
  347. if ('-' === $s) $class[0] = '-' . $class[0];
  348. else if (!isset($s[2])) $class[0] .= preg_quote($s, '/');
  349. else if (1 === iconv_strlen($s, 'UTF-8')) $class[0] .= $s;
  350. else $class[] = $s;
  351. }
  352. $class[0] = '[' . $class[0] . ']';
  353. if (1 === count($class)) return $class[0];
  354. else return '(?:' . implode('|', $class) . ')';
  355. }
  356. protected static function getData($file)
  357. {
  358. $file = __DIR__ . '/Utf8/data/' . $file . '.ser';
  359. if (file_exists($file)) return unserialize(file_get_contents($file));
  360. else return false;
  361. }
  362. }