PageRenderTime 49ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/patchwork/utf8/class/Patchwork/Utf8.php

https://gitlab.com/MualnuamSolutions/book-library
PHP | 613 lines | 502 code | 86 blank | 25 comment | 77 complexity | a463809cd50bcd1a190fa874a23551b8 MD5 | raw file
Possible License(s): LGPL-3.0, LGPL-2.1, GPL-3.0, MIT, BSD-3-Clause
  1. <?php // vi: set fenc=utf-8 ts=4 sw=4 et:
  2. /*
  3. * Copyright (C) 2013 Nicolas Grekas - p@tchwork.com
  4. *
  5. * This library is free software; you can redistribute it and/or modify it
  6. * under the terms of the (at your option):
  7. * Apache License v2.0 (http://apache.org/licenses/LICENSE-2.0.txt), or
  8. * GNU General Public License v2.0 (http://gnu.org/licenses/gpl-2.0.txt).
  9. */
  10. namespace Patchwork;
  11. use Normalizer as n;
  /**
   * UTF-8 Grapheme Cluster aware string manipulations implementing the quasi complete
   * set of native PHP string functions that need UTF-8 awareness and more.
   * Missing are printf-family functions.
   *
   * The methods delegate to the intl (grapheme_*, Normalizer), mbstring (mb_*),
   * iconv and PCRE functions; offsets and lengths are counted in grapheme clusters
   * wherever a grapheme_* function or self::str_split() is involved.
   */
  class Utf8
  {
      /**
       * Pairs used by the first pass of strtocasefold(): every entry of the first
       * list is replaced by the entry at the same index in the second list.
       * The first list is written with byte escapes so that it survives any
       * re-encoding of this source file (U+00B5, U+017F, U+0345, U+03C2, ...).
       */
      protected static $commonCaseFold = array(
          array("\xC2\xB5", "\xC5\xBF", "\xCD\x85", "\xCF\x82", "\xCF\x90", "\xCF\x91", "\xCF\x95", "\xCF\x96", "\xCF\xB0", "\xCF\xB1", "\xCF\xB5", "\xE1\xBA\x9B", "\xE1\xBE\xBE"),
          array('μ', 's', 'ι', 'σ', 'β', 'θ', 'φ', 'π', 'κ', 'ρ', 'ε', "\xE1\xB9\xA1", 'ι'),
      );

      /**
       * UTF-8 encoded C1 control characters (U+0080..U+009F) as produced by an
       * ISO-8859-1 to UTF-8 conversion of the bytes that Windows-1252 assigns to
       * printable characters. Index-aligned with self::$utf8.
       *
       * These MUST stay as byte escapes: written as literal characters they are
       * easily turned into their Windows-1252 look-alikes by editors and tools,
       * which makes this list identical to self::$utf8 and silently disables the
       * mapping done by utf8_encode() and utf8_decode().
       */
      protected static $cp1252 = array(
          "\xC2\x80", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89",
          "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8E", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95",
          "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9E", "\xC2\x9F",
      );

      /**
       * The printable characters Windows-1252 assigns to bytes 0x80..0x9F,
       * index-aligned with self::$cp1252.
       */
      protected static $utf8 = array(
          '€', '‚', 'ƒ', '„', '…', '†', '‡', 'ˆ', '‰',
          'Š', '‹', 'Œ', 'Ž', '‘', '’', '“', '”', '•',
          '–', '—', '˜', '™', 'š', '›', 'œ', 'ž', 'Ÿ',
      );


      /**
       * Tells whether $s is a valid UTF-8 string.
       */
      public static function isUtf8($s)
      {
          // The empty pattern only matches when PCRE accepts the subject as UTF-8.
          // Since PHP 5.2.5, this also excludes invalid five and six bytes sequences
          return (bool) preg_match('//u', $s);
      }

      /**
       * Generic UTF-8 to ASCII transliteration.
       *
       * @param string $s         The string to transliterate.
       * @param string $subst_chr Substitute used when no ASCII equivalent is found.
       */
      public static function toAscii($s, $subst_chr = '?')
      {
          // Pure ASCII input is returned untouched.
          if (! preg_match("/[\x80-\xFF]/", $s)) {
              return $s;
          }

          static $translitExtra = array();
          $translitExtra or $translitExtra = static::getData('translit_extra');

          $s = n::normalize($s, n::NFKC);
          $glibc = 'glibc' === ICONV_IMPL;

          preg_match_all('/./u', $s, $s);

          foreach ($s[0] as &$c) {
              // Single byte characters are already ASCII.
              if (! isset($c[1])) {
                  continue;
              }

              if ($glibc) {
                  $t = iconv('UTF-8', 'ASCII//TRANSLIT', $c);
              } else {
                  $t = iconv('UTF-8', 'ASCII//IGNORE//TRANSLIT', $c);

                  if (! isset($t[0])) {
                      $t = '?';
                  } else if (isset($t[1])) {
                      // Drop the ASCII accent marks some iconv implementations prepend.
                      $t = ltrim($t, '\'`"^~');
                  }
              }

              if ('?' === $t) {
                  if (isset($translitExtra[$c])) {
                      $t = $translitExtra[$c];
                  } else {
                      // Last resort: keep the base character of the decomposition
                      // when it is ASCII, otherwise use the substitute.
                      $t = n::normalize($c, n::NFD);
                      $t = $t[0] < "\x80" ? $t[0] : $subst_chr;
                  }
              }

              $c = $t;
          }
          // Break the reference left behind by the by-reference foreach.
          unset($c);

          return implode('', $s[0]);
      }

      /**
       * Recursively normalizes the strings found in $var (arrays and objects are
       * walked, other types are returned as is).
       *
       * Strings get their line endings converted to "\n", are normalized to
       * $normalization_form (or passed through static::utf8_encode() when the
       * normalizer rejects them) and are prefixed with $leading_combining when
       * they start with a combining mark.
       *
       * The default form is n::NFC rather than a literal number because the
       * numeric value of the Normalizer constants differs between PHP versions.
       */
      public static function filter($var, $normalization_form = n::NFC, $leading_combining = '◌')
      {
          switch (gettype($var)) {
              case 'array':
                  foreach ($var as $k => $v) {
                      $var[$k] = static::filter($v, $normalization_form, $leading_combining);
                  }
                  break;

              case 'object':
                  foreach ($var as $k => $v) {
                      $var->$k = static::filter($v, $normalization_form, $leading_combining);
                  }
                  break;

              case 'string':
                  if (false !== strpos($var, "\r")) {
                      // Workaround https://bugs.php.net/65732
                      $var = str_replace("\r\n", "\n", $var);
                      $var = strtr($var, "\r", "\n");
                  }

                  if (preg_match('/[\x80-\xFF]/', $var)) {
                      if (n::isNormalized($var, $normalization_form)) {
                          $n = '';
                      } else {
                          $n = n::normalize($var, $normalization_form);

                          if (false === $n) {
                              $var = static::utf8_encode($var);
                          } else {
                              $var = $n;
                          }
                      }

                      if ($var[0] >= "\x80" && false !== $n && isset($leading_combining[0]) && preg_match('/^\p{Mn}/u', $var)) {
                          // Prevent leading combining chars
                          // for NFC-safe concatenations.
                          $var = $leading_combining . $var;
                      }
                  }
                  break;
          }

          return $var;
      }

      /**
       * Unicode transformation for caseless matching.
       * see http://unicode.org/reports/tr21/tr21-5.html
       *
       * @param bool $full Also apply the replacements loaded from the
       *                   "caseFolding_full" data file.
       */
      public static function strtocasefold($s, $full = true)
      {
          $s = str_replace(self::$commonCaseFold[0], self::$commonCaseFold[1], $s);

          if ($full) {
              static $fullCaseFold = false;
              $fullCaseFold || $fullCaseFold = static::getData('caseFolding_full');

              $s = str_replace($fullCaseFold[0], $fullCaseFold[1], $s);
          }

          return static::strtolower($s);
      }

      /**
       * Generic case sensitive collation support for self::strnatcmp():
       * decomposes $s then removes the non-spacing marks.
       */
      public static function strtonatfold($s)
      {
          $s = n::normalize($s, n::NFD);

          return preg_replace('/\p{Mn}+/u', '', $s);
      }


      // PHP string functions that need UTF-8 awareness

      /**
       * Native filter_input() whose result is passed through static::filter().
       */
      public static function filter_input($type, $var, $filter = FILTER_DEFAULT, $option = null)
      {
          // Forward $option only when the caller supplied it.
          if (4 > func_num_args()) {
              $var = filter_input($type, $var, $filter);
          } else {
              $var = filter_input($type, $var, $filter, $option);
          }

          return static::filter($var);
      }

      /**
       * Native filter_input_array() whose result is passed through static::filter().
       */
      public static function filter_input_array($type, $def = null, $add_empty = true)
      {
          if (2 > func_num_args()) {
              $a = filter_input_array($type);
          } else {
              $a = filter_input_array($type, $def, $add_empty);
          }

          return static::filter($a);
      }

      /**
       * Native json_decode() whose result is passed through static::filter().
       */
      public static function json_decode($json, $assoc = false, $depth = 512, $options = 0)
      {
          if (PHP_VERSION_ID < 50400) {
              // The $options argument is only forwarded from PHP 5.4 on.
              $json = json_decode($json, $assoc, $depth);
          } else {
              $json = json_decode($json, $assoc, $depth, $options);
          }

          return static::filter($json);
      }

      /**
       * Grapheme cluster aware substr().
       */
      public static function substr($s, $start, $len = 2147483647)
      {
          // Probe once for https://bugs.php.net/62759 and use the shim when affected.
          static $bug62759;
          isset($bug62759) or $bug62759 = extension_loaded('intl') && 'à' === grapheme_substr('éà', 1, -2);

          if ($bug62759) {
              return PHP\Shim\Intl::grapheme_substr_workaround62759($s, $start, $len);
          }

          return grapheme_substr($s, $start, $len);
      }

      /** Grapheme cluster aware strlen(). */
      public static function strlen($s)
      {
          return grapheme_strlen($s);
      }

      /** Grapheme cluster aware strpos(). */
      public static function strpos($s, $needle, $offset = 0)
      {
          return grapheme_strpos($s, $needle, $offset);
      }

      /** Grapheme cluster aware strrpos(). */
      public static function strrpos($s, $needle, $offset = 0)
      {
          return grapheme_strrpos($s, $needle, $offset);
      }

      /**
       * Grapheme cluster aware stripos().
       */
      public static function stripos($s, $needle, $offset = 0)
      {
          if (50418 > PHP_VERSION_ID || 50500 == PHP_VERSION_ID) {
              // Don't use grapheme_stripos because of https://bugs.php.net/61860
              if (! preg_match('//u', $s .= '')) {
                  return false;
              }
              if ($offset < 0) {
                  $offset = 0;
              }

              $pos = mb_stripos($s, $needle .= '', $offset, 'UTF-8');

              // Both "not found" (false) and "found at 0" are returned as is.
              if (! $pos) {
                  return $pos;
              }

              // Convert the code point offset into a grapheme cluster offset.
              return grapheme_strlen(iconv_substr($s, 0, $pos, 'UTF-8'));
          }

          return grapheme_stripos($s, $needle, $offset);
      }

      /**
       * Grapheme cluster aware strripos().
       */
      public static function strripos($s, $needle, $offset = 0)
      {
          if (50418 > PHP_VERSION_ID || 50500 == PHP_VERSION_ID) {
              // Don't use grapheme_strripos because of https://bugs.php.net/61860
              if (! preg_match('//u', $s .= '')) {
                  return false;
              }
              if ($offset < 0) {
                  $offset = 0;
              }

              $pos = mb_strripos($s, $needle .= '', $offset, 'UTF-8');

              if (! $pos) {
                  return $pos;
              }

              return grapheme_strlen(iconv_substr($s, 0, $pos, 'UTF-8'));
          }

          return grapheme_strripos($s, $needle, $offset);
      }

      /**
       * UTF-8 aware stristr(); an empty needle yields false.
       */
      public static function stristr($s, $needle, $before_needle = false)
      {
          if ('' === $needle .= '') {
              return false;
          }

          return mb_stristr($s, $needle, $before_needle, 'UTF-8');
      }

      /** Grapheme cluster aware strstr(). */
      public static function strstr($s, $needle, $before_needle = false)
      {
          return grapheme_strstr($s, $needle, $before_needle);
      }

      /** UTF-8 aware strrchr(). */
      public static function strrchr($s, $needle, $before_needle = false)
      {
          return mb_strrchr($s, $needle, $before_needle, 'UTF-8');
      }

      /** UTF-8 aware, case insensitive strrchr(). */
      public static function strrichr($s, $needle, $before_needle = false)
      {
          return mb_strrichr($s, $needle, $before_needle, 'UTF-8');
      }

      /** UTF-8 aware strtolower(). */
      public static function strtolower($s)
      {
          return mb_strtolower($s, 'UTF-8');
      }

      /** UTF-8 aware strtoupper(). */
      public static function strtoupper($s)
      {
          return mb_strtoupper($s, 'UTF-8');
      }

      /**
       * Grapheme cluster aware wordwrap().
       *
       * The wrapping decision is delegated to the native wordwrap(): the text is
       * first turned into an ASCII mask with one byte per grapheme cluster
       * (' ' for a space, '#' for an existing break, '?' for anything else), the
       * mask is wrapped natively, then the breaks found in the wrapped mask are
       * replayed on the real grapheme clusters. This keeps the result aligned
       * with the native function, including for lines of exactly $width and for
       * input that already contains $break.
       *
       * This implementation could be extended to handle unicode word boundaries
       * (see http://www.unicode.org/reports/tr29/)
       */
      public static function wordwrap($s, $width = 75, $break = "\n", $cut = false)
      {
          $width = (int) $width;

          // Let the native function validate (and report on) the arguments.
          if (false === wordwrap('-', $width, $break, $cut)) {
              return false;
          }

          is_string($break) or $break = (string) $break;

          $paragraphs = explode($break, $s);

          if (1 === count($paragraphs) && '' === $paragraphs[0]) {
              return '';
          }

          $mask = '';
          $chars = array();

          foreach ($paragraphs as $i => $paragraph) {
              if ($i) {
                  $chars[] = $break;
                  $mask .= '#';
              }

              foreach (self::str_split($paragraph) as $c) {
                  $chars[] = $c;
                  $mask .= ' ' === $c ? ' ' : '?';
              }
          }

          $mask = wordwrap($mask, $width, '#', $cut);

          $result = '';
          $j = 0;       // Index of the next unconsumed entry of $chars
          $b = $i = -1; // Offsets in the wrapped mask

          while (false !== $b = strpos($mask, '#', $b + 1)) {
              for (++$i; $i < $b; ++$i) {
                  $result .= $chars[$j];
                  unset($chars[$j++]);
              }

              // A break that replaced a space or an existing break consumes it;
              // a break inserted inside a cut word consumes nothing.
              if ($break === $chars[$j] || ' ' === $chars[$j]) {
                  unset($chars[$j++]);
              }

              $result .= $break;
          }

          return $result . implode('', $chars);
      }

      /**
       * Returns the UTF-8 encoding of code point $c (taken modulo 0x200000).
       */
      public static function chr($c)
      {
          $c %= 0x200000;

          if (0x80 > $c) {
              return chr($c);
          }
          if (0x800 > $c) {
              return chr(0xC0 | $c >> 6) . chr(0x80 | $c & 0x3F);
          }
          if (0x10000 > $c) {
              return chr(0xE0 | $c >> 12) . chr(0x80 | $c >> 6 & 0x3F) . chr(0x80 | $c & 0x3F);
          }

          return chr(0xF0 | $c >> 18) . chr(0x80 | $c >> 12 & 0x3F) . chr(0x80 | $c >> 6 & 0x3F) . chr(0x80 | $c & 0x3F);
      }

      /**
       * Counts the occurrences of each grapheme cluster of $s.
       * A warning is raised for any $mode other than 1.
       */
      public static function count_chars($s, $mode = 0)
      {
          if (1 != $mode) {
              user_error(__METHOD__ . '(): the only allowed $mode is 1', E_USER_WARNING);
          }

          return array_count_values(self::str_split($s));
      }

      /**
       * UTF-8 aware ltrim(). Without $charlist, whitespace (\s) is stripped.
       * An empty $charlist strips nothing.
       */
      public static function ltrim($s, $charlist = INF)
      {
          if (INF === $charlist) {
              $charlist = '\s';
          } else {
              $charlist = (string) $charlist;

              // Nothing to strip, and "[]" would not be a valid character class.
              if ('' === $charlist) {
                  return $s;
              }

              $charlist = self::rxClass($charlist);
          }

          return preg_replace("/^{$charlist}+/u", '', $s);
      }

      /**
       * Returns the code point of the first UTF-8 character of $s (0 for an
       * empty string). The input is expected to be valid UTF-8.
       */
      public static function ord($s)
      {
          $s = unpack('C*', substr($s, 0, 4));
          $a = $s ? $s[1] : 0;

          if (0xF0 <= $a) {
              return (($a - 0xF0) << 18) + (($s[2] - 0x80) << 12) + (($s[3] - 0x80) << 6) + $s[4] - 0x80;
          }
          if (0xE0 <= $a) {
              return (($a - 0xE0) << 12) + (($s[2] - 0x80) << 6) + $s[3] - 0x80;
          }
          if (0xC0 <= $a) {
              return (($a - 0xC0) << 6) + $s[2] - 0x80;
          }

          return $a;
      }

      /**
       * UTF-8 aware rtrim(). Without $charlist, whitespace (\s) is stripped.
       * An empty $charlist strips nothing.
       */
      public static function rtrim($s, $charlist = INF)
      {
          if (INF === $charlist) {
              $charlist = '\s';
          } else {
              $charlist = (string) $charlist;

              if ('' === $charlist) {
                  return $s;
              }

              $charlist = self::rxClass($charlist);
          }

          // The D modifier anchors "$" at the very end of the subject: without it,
          // "$" also matches before a final newline and characters preceding that
          // newline would be stripped.
          return preg_replace("/{$charlist}+$/uD", '', $s);
      }

      /** UTF-8 aware trim(): self::ltrim() followed by self::rtrim(). */
      public static function trim($s, $charlist = INF)
      {
          return self::rtrim(self::ltrim($s, $charlist), $charlist);
      }

      /**
       * UTF-8 aware str_ireplace().
       *
       * @param int|null $count Receives the number of replacements performed.
       */
      public static function str_ireplace($search, $replace, $subject, &$count = null)
      {
          $search = (array) $search;

          foreach ($search as $i => $s) {
              if ('' === $s .= '') {
                  // A pattern that can never match: empty searches replace nothing.
                  $s = '/^(?<=.)$/';
              } else {
                  $s = '/' . preg_quote($s, '/') . '/ui';
              }

              $search[$i] = $s;
          }

          // Replacements are plain text: neutralize "\" and "$" so that preg_replace()
          // does not interpret sequences such as "$1" or "\1" as backreferences.
          if (is_array($replace)) {
              foreach ($replace as $i => $r) {
                  $replace[$i] = addcslashes($r, '\\$');
              }
          } else {
              $replace = addcslashes($replace, '\\$');
          }

          return preg_replace($search, $replace, $subject, -1, $count);
      }

      /**
       * Grapheme cluster aware str_pad().
       *
       * Raises a warning and returns null when $pad is empty or when $type is
       * not one of the STR_PAD_* constants.
       */
      public static function str_pad($s, $len, $pad = ' ', $type = STR_PAD_RIGHT)
      {
          $slen = grapheme_strlen($s);
          if ($len <= $slen) {
              return $s;
          }

          $padlen = grapheme_strlen($pad);
          if (! $padlen) {
              // Avoid dividing by zero below.
              user_error(__METHOD__ . '(): Padding string cannot be empty', E_USER_WARNING);

              return null;
          }

          $freelen = $len - $slen;

          if (STR_PAD_RIGHT == $type) {
              return $s . self::buildPadding($pad, $padlen, $freelen);
          }
          if (STR_PAD_LEFT == $type) {
              return self::buildPadding($pad, $padlen, $freelen) . $s;
          }
          if (STR_PAD_BOTH == $type) {
              // When the free length is odd, the extra cluster goes to the right.
              $left = (int) floor($freelen / 2);
              $right = $freelen - $left;

              return self::buildPadding($pad, $padlen, $left) . $s . self::buildPadding($pad, $padlen, $right);
          }

          user_error(__METHOD__ . '(): Padding type has to be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH', E_USER_WARNING);
      }

      /**
       * Builds $n grapheme clusters of padding by repeating $pad
       * ($padlen clusters long) and truncating the last repetition.
       */
      private static function buildPadding($pad, $padlen, $n)
      {
          $n = (int) $n;
          $rest = $n % $padlen;

          // Integer repeat count: str_repeat() must not be given a fractional float.
          return str_repeat($pad, (int) (($n - $rest) / $padlen)) . ($rest ? grapheme_substr($pad, 0, $rest) : '');
      }

      /** Shuffles the grapheme clusters of $s. */
      public static function str_shuffle($s)
      {
          $s = self::str_split($s);
          shuffle($s);

          return implode('', $s);
      }

      /**
       * Splits $s in chunks of $len grapheme clusters.
       */
      public static function str_split($s, $len = 1)
      {
          $chunk = (int) $len;

          if (1 > $chunk) {
              // Hand the caller's original value to the native function so that
              // it reports the invalid length itself.
              return str_split($s, $len);
          }

          if (extension_loaded('intl')) {
              $clusters = array();
              $p = 0;
              $bytes = strlen($s);

              // grapheme_extract() updates $p with the byte offset of the next cluster.
              while ($p < $bytes) {
                  $clusters[] = grapheme_extract($s, 1, GRAPHEME_EXTR_COUNT, $p, $p);
              }
          } else {
              preg_match_all('/' . GRAPHEME_CLUSTER_RX . '/u', $s, $clusters);
              $clusters = $clusters[0];
          }

          if (1 == $chunk) {
              return $clusters;
          }

          $chunks = array();
          $p = -1;

          foreach ($clusters as $i => $cluster) {
              if ($i % $chunk) {
                  $chunks[$p] .= $cluster;
              } else {
                  $chunks[++$p] = $cluster;
              }
          }

          return $chunks;
      }

      /**
       * UTF-8 aware str_word_count(): words are runs of letters (\pL) and of the
       * characters of $charlist, possibly joined by dashes or apostrophes.
       *
       * @param int $format 0 returns the number of words, 1 the list of words,
       *                    2 the words indexed by their grapheme cluster offset.
       */
      public static function str_word_count($s, $format = 0, $charlist = '')
      {
          $charlist = self::rxClass($charlist, '\pL');

          // Odd indexes hold the words, even indexes what lies between them.
          $s = preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $s, -1, PREG_SPLIT_DELIM_CAPTURE);

          $len = count($s);

          if (1 == $format) {
              $words = array();
              for ($i = 1; $i < $len; $i += 2) {
                  $words[] = $s[$i];
              }

              return $words;
          }

          if (2 == $format) {
              $words = array();
              $offset = grapheme_strlen($s[0]);

              for ($i = 1; $i < $len; $i += 2) {
                  $words[$offset] = $s[$i];
                  $offset += grapheme_strlen($s[$i]) + grapheme_strlen($s[$i + 1]);
              }

              return $words;
          }

          return ($len - 1) / 2;
      }

      /** strcmp() on the NFD normalized forms of $a and $b. */
      public static function strcmp($a, $b)
      {
          return $a . '' === $b . '' ? 0 : strcmp(n::normalize($a, n::NFD), n::normalize($b, n::NFD));
      }

      /** strnatcmp() on the self::strtonatfold() forms of $a and $b. */
      public static function strnatcmp($a, $b)
      {
          return $a . '' === $b . '' ? 0 : strnatcmp(self::strtonatfold($a), self::strtonatfold($b));
      }

      /** self::strcmp() on the case folded forms of $a and $b. */
      public static function strcasecmp($a, $b)
      {
          return self::strcmp(static::strtocasefold($a), static::strtocasefold($b));
      }

      /** self::strnatcmp() on the case folded forms of $a and $b. */
      public static function strnatcasecmp($a, $b)
      {
          return self::strnatcmp(static::strtocasefold($a), static::strtocasefold($b));
      }

      /** self::strncmp() on the case folded forms of $a and $b. */
      public static function strncasecmp($a, $b, $len)
      {
          return self::strncmp(static::strtocasefold($a), static::strtocasefold($b), $len);
      }

      /** self::strcmp() limited to the first $len grapheme clusters. */
      public static function strncmp($a, $b, $len)
      {
          return self::strcmp(self::substr($a, 0, $len), self::substr($b, 0, $len));
      }

      /**
       * Grapheme cluster aware strcspn(); returns null for an empty $charlist.
       */
      public static function strcspn($s, $charlist, $start = 0, $len = 2147483647)
      {
          if ('' === $charlist .= '') {
              return null;
          }

          if ($start || 2147483647 != $len) {
              $s = self::substr($s, $start, $len);
          }

          if (preg_match('/^(.*?)' . self::rxClass($charlist) . '/us', $s, $m)) {
              return grapheme_strlen($m[1]);
          }

          return grapheme_strlen($s);
      }

      /**
       * UTF-8 aware strpbrk(); returns false when nothing matches or when
       * $charlist is empty.
       */
      public static function strpbrk($s, $charlist)
      {
          // "[]" would not be a valid character class.
          if ('' === (string) $charlist) {
              return false;
          }

          if (preg_match('/' . self::rxClass($charlist) . '/us', $s, $m)) {
              return substr($s, strpos($s, $m[0]));
          }

          return false;
      }

      /** Reverses the grapheme clusters of $s. */
      public static function strrev($s)
      {
          return implode('', array_reverse(self::str_split($s)));
      }

      /**
       * Grapheme cluster aware strspn(); returns 0 for an empty $mask.
       */
      public static function strspn($s, $mask, $start = 0, $len = 2147483647)
      {
          if ('' === (string) $mask) {
              return 0;
          }

          if ($start || 2147483647 != $len) {
              $s = self::substr($s, $start, $len);
          }

          return preg_match('/^' . self::rxClass($mask) . '+/u', $s, $m) ? grapheme_strlen($m[0]) : 0;
      }

      /**
       * Grapheme cluster aware strtr().
       *
       * With three arguments, each grapheme cluster of $from is mapped to the
       * cluster at the same position in $to (the longer list is truncated).
       * With two arguments, $from is handed to the native strtr() as is.
       */
      public static function strtr($s, $from, $to = INF)
      {
          if (INF !== $to) {
              $from = self::str_split($from);
              $to = self::str_split($to);

              $a = count($from);
              $b = count($to);

              if ($a > $b) {
                  $from = array_slice($from, 0, $b);
              } else if ($a < $b) {
                  $to = array_slice($to, 0, $a);
              }

              $from = array_combine($from, $to);
          }

          return strtr($s, $from);
      }

      /**
       * Grapheme cluster aware substr_compare().
       *
       * @param mixed $i Truthy for a case insensitive comparison.
       */
      public static function substr_compare($a, $b, $offset, $len = 2147483647, $i = 0)
      {
          $a = self::substr($a, $offset, $len);

          return $i ? static::strcasecmp($a, $b) : self::strcmp($a, $b);
      }

      /** substr_count() on the self::substr() slice of $s. */
      public static function substr_count($s, $needle, $offset = 0, $len = 2147483647)
      {
          return substr_count(self::substr($s, $offset, $len), $needle);
      }

      /**
       * Grapheme cluster aware substr_replace() for string arguments.
       */
      public static function substr_replace($s, $replace, $start, $len = 2147483647)
      {
          $s = self::str_split($s);
          $replace = self::str_split($replace);
          array_splice($s, $start, $len, $replace);

          return implode('', $s);
      }

      /** Title-cases the first UTF-8 character of $s. */
      public static function ucfirst($s)
      {
          $c = iconv_substr($s, 0, 1, 'UTF-8');

          // strlen()/substr() work on bytes here to skip exactly the bytes of $c.
          return static::ucwords($c) . substr($s, strlen($c));
      }

      /** Lower-cases the first UTF-8 character of $s. */
      public static function lcfirst($s)
      {
          $c = iconv_substr($s, 0, 1, 'UTF-8');

          return static::strtolower($c) . substr($s, strlen($c));
      }

      /** UTF-8 aware ucwords(). */
      public static function ucwords($s)
      {
          return mb_convert_case($s, MB_CASE_TITLE, 'UTF-8');
      }

      /**
       * number_format() accepting multi-byte separators on every PHP version.
       */
      public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
      {
          if (PHP_VERSION_ID < 50400) {
              if (isset($thousands_sep[1]) || isset($dec_point[1])) {
                  // Format with the default separators, then swap them.
                  return str_replace(
                      array('.', ','),
                      array($dec_point, $thousands_sep),
                      number_format($number, $decimals, '.', ',')
                  );
              }
          }

          return number_format($number, $decimals, $dec_point, $thousands_sep);
      }

      /**
       * Converts an ISO-8859-1 string to UTF-8, then maps the C1 control
       * characters listed in self::$cp1252 to the characters of self::$utf8.
       */
      public static function utf8_encode($s)
      {
          // Same byte-to-code-point mapping as the native utf8_encode(), which is
          // deprecated as of PHP 8.2; the native function remains the fallback.
          if (function_exists('mb_convert_encoding')) {
              $s = mb_convert_encoding($s, 'UTF-8', 'ISO-8859-1');
          } else {
              $s = utf8_encode($s);
          }

          // Every entry of self::$cp1252 starts with "\xC2".
          if (false === strpos($s, "\xC2")) {
              return $s;
          }

          return str_replace(self::$cp1252, self::$utf8, $s);
      }

      /**
       * Reverse of self::utf8_encode(): maps the characters of self::$utf8 back
       * to the control characters of self::$cp1252, then calls utf8_decode().
       */
      public static function utf8_decode($s)
      {
          $s = str_replace(self::$utf8, self::$cp1252, $s);

          // NOTE(review): utf8_decode() is deprecated as of PHP 8.2. It is kept
          // because mb_convert_encoding() may treat invalid input differently.
          return utf8_decode($s);
      }


      /**
       * Builds a regular expression fragment matching any one of the grapheme
       * clusters of $s, to be embedded in a "/" delimited pattern.
       *
       * @param string $class Raw character class content to start from.
       */
      protected static function rxClass($s, $class = '')
      {
          $class = array($class);

          foreach (self::str_split($s) as $s) {
              if ('-' === $s) {
                  // A leading dash is a literal dash inside a character class.
                  $class[0] = '-' . $class[0];
              } else if (!isset($s[2])) {
                  $class[0] .= preg_quote($s, '/');
              } else if (1 === iconv_strlen($s, 'UTF-8')) {
                  $class[0] .= $s;
              } else {
                  // Clusters made of several code points cannot live inside a
                  // character class: they become quoted alternatives.
                  $class[] = preg_quote($s, '/');
              }
          }

          $class[0] = '[' . $class[0] . ']';

          if (1 === count($class)) {
              return $class[0];
          }

          return '(?:' . implode('|', $class) . ')';
      }

      /**
       * Loads a serialized data file shipped in the Utf8/data directory next to
       * this class; returns false when the file does not exist.
       */
      protected static function getData($file)
      {
          $file = __DIR__ . '/Utf8/data/' . $file . '.ser';

          if (file_exists($file)) {
              return unserialize(file_get_contents($file));
          }

          return false;
      }
  }