PageRenderTime 44ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/class/Patchwork/PHP/Override/Mbstring.php

http://github.com/nicolas-grekas/Patchwork-UTF8
PHP | 336 lines | 223 code | 54 blank | 59 comment | 53 complexity | e44eb2939fcc4001bf1313d7b50e869a MD5 | raw file
  1. <?php // vi: set fenc=utf-8 ts=4 sw=4 et:
  2. /*
  3. * Copyright (C) 2012 Nicolas Grekas - p@tchwork.com
  4. *
  5. * This library is free software; you can redistribute it and/or modify it
  6. * under the terms of the (at your option):
  7. * Apache License v2.0 (http://apache.org/licenses/LICENSE-2.0.txt), or
  8. * GNU General Public License v2.0 (http://gnu.org/licenses/gpl-2.0.txt).
  9. */
  10. namespace Patchwork\PHP\Override;
  11. /**
  12. * Partial mbstring implementation in PHP, iconv based, UTF-8 centric.
  13. *
  14. * Implemented:
  15. * - mb_convert_encoding - Convert character encoding
  16. * - mb_decode_mimeheader - Decode string in MIME header field
  17. * - mb_encode_mimeheader - Encode string for MIME header XXX NATIVE IMPLEMENTATION IS REALLY BUGGED
  18. * - mb_convert_case - Perform case folding on a string
  19. * - mb_internal_encoding - Set/Get internal character encoding
  20. * - mb_list_encodings - Returns an array of all supported encodings
  21. * - mb_strlen - Get string length
  22. * - mb_strpos - Find position of first occurrence of string in a string
  23. * - mb_strrpos - Find position of last occurrence of a string in a string
  24. * - mb_strtolower - Make a string lowercase
  25. * - mb_strtoupper - Make a string uppercase
  26. * - mb_substitute_character - Set/Get substitution character
  27. * - mb_substr - Get part of string
  28. * - mb_stripos - Finds position of first occurrence of a string within another, case insensitive
  29. * - mb_stristr - Finds first occurrence of a string within another, case insensitive
  30. * - mb_strrchr - Finds the last occurrence of a character in a string within another
  31. * - mb_strrichr - Finds the last occurrence of a character in a string within another, case insensitive
  32. * - mb_strripos - Finds position of last occurrence of a string within another, case insensitive
  33. * - mb_strstr - Finds first occurrence of a string within another
  34. *
  35. * Not implemented:
  36. * - mb_check_encoding - Check if the string is valid for the specified encoding
  37. * - mb_convert_kana - Convert "kana" one from another ("zen-kaku", "han-kaku" and more)
  38. * - mb_convert_variables - Convert character code in variable(s)
  39. * - mb_decode_numericentity - Decode HTML numeric string reference to character
  40. * - mb_detect_encoding - Detect character encoding
  41. * - mb_detect_order - Set/Get character encoding detection order
  42. * - mb_encode_numericentity - Encode character to HTML numeric string reference
  43. * - mb_ereg* - Regular expression with multibyte support
  44. * - mb_get_info - Get internal settings of mbstring
  45. * - mb_http_input - Detect HTTP input character encoding
  46. * - mb_http_output - Set/Get HTTP output character encoding
  47. * - mb_language - Set/Get current language
  48. * - mb_list_encodings_alias_names - Returns an array of all supported alias encodings
  49. * - mb_list_mime_names - Returns an array or string of all supported mime names
  50. * - mb_output_handler - Callback function converts character encoding in output buffer
  51. * - mb_parse_str - Parse GET/POST/COOKIE data and set global variable
  52. * - mb_preferred_mime_name - Get MIME charset string
  53. * - mb_regex_encoding - Returns current encoding for multibyte regex as string
  54. * - mb_regex_set_options - Set/Get the default options for mbregex functions
  55. * - mb_send_mail - Send encoded mail
  56. * - mb_split - Split multibyte string using regular expression
  57. * - mb_strcut - Get part of string
  58. * - mb_strimwidth - Get truncated string with specified width
  59. * - mb_strwidth - Return width of string
  60. * - mb_substr_count - Count the number of substring occurrences
  61. */
  62. class Mbstring
  63. {
  64. const MB_CASE_FOLD = PHP_INT_MAX;
  65. protected static
  66. $internal_encoding = 'UTF-8',
  67. $caseFold = array(
  68. array('ΔΎ','?',"\xCD\x85",'?',"\xCF\x90","\xCF\x91","\xCF\x95","\xCF\x96","\xCF\xB0","\xCF\xB1","\xCF\xB5","\xE1\xBA\x9B","\xE1\xBE\xBE"),
  69. array('?','s','?', '?','?', '?', '?', '?', '?', '?', '?', "\xE1\xB9\xA1",'?' )
  70. );
  71. static function mb_convert_encoding($s, $to_encoding, $from_encoding = INF)
  72. {
  73. INF === $from_encoding && $from_encoding = self::$internal_encoding;
  74. if ('base64' === $from_encoding)
  75. {
  76. $s = base64_decode($s);
  77. $from_encoding = $to_encoding;
  78. }
  79. if ('base64' === $to_encoding) return base64_encode($s);
  80. if ('html-entities' === $to_encoding)
  81. {
  82. 'html-entities' === $from_encoding && $from_encoding = 'Windows-1252';
  83. 'utf-8' === $from_encoding || $s = iconv($from_encoding, 'UTF-8//IGNORE', $s);
  84. return preg_replace_callback('/[\x80-\xFF]+/', array(__CLASS__, 'html_encoding_callback'), $s);
  85. }
  86. if ('html-entities' === $from_encoding)
  87. {
  88. $s = html_entity_decode($s, ENT_COMPAT, 'UTF-8');
  89. $from_encoding = 'UTF-8';
  90. }
  91. return iconv($from_encoding, $to_encoding . '//IGNORE', $s);
  92. }
  93. static function mb_decode_mimeheader($s)
  94. {
  95. return iconv_mime_decode($s, 2, self::$internal_encoding . '//IGNORE');
  96. }
  97. static function mb_encode_mimeheader($s, $charset = INF, $transfer_encoding = INF, $linefeed = INF, $indent = INF)
  98. {
  99. user_error('mb_encode_mimeheader() is bugged. Please use iconv_mime_encode() instead.');
  100. }
  101. static function mb_convert_case($s, $mode, $encoding = INF)
  102. {
  103. if ('' === $s) return '';
  104. INF === $encoding && $encoding = self::$internal_encoding;
  105. if ('UTF-8' === strtoupper($encoding)) $encoding = INF;
  106. else $s = iconv($encoding, 'UTF-8//IGNORE', $s);
  107. if (MB_CASE_UPPER == $mode)
  108. {
  109. static $upper;
  110. isset($upper) || $upper = self::getData('upperCase');
  111. $map = $upper;
  112. }
  113. else
  114. {
  115. if (self::MB_CASE_FOLD === $mode) $s = str_replace(self::$caseFold[0], self::$caseFold[1], $s);
  116. static $lower;
  117. isset($lower) || $lower = self::getData('lowerCase');
  118. $map = $lower;
  119. }
  120. static $ulen_mask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4);
  121. $i = 0;
  122. $len = strlen($s);
  123. while ($i < $len)
  124. {
  125. $ulen = $s[$i] < "\x80" ? 1 : $ulen_mask[$s[$i] & "\xF0"];
  126. $uchr = substr($s, $i, $ulen);
  127. $i += $ulen;
  128. if (isset($map[$uchr]))
  129. {
  130. $uchr = $map[$uchr];
  131. $nlen = strlen($uchr);
  132. if ($nlen == $ulen)
  133. {
  134. $nlen = $i;
  135. do $s[--$nlen] = $uchr[--$ulen];
  136. while ($ulen);
  137. }
  138. else
  139. {
  140. $s = substr_replace($s, $uchr, $i, $ulen);
  141. $len += $nlen - $ulen;
  142. $i += $nlen - $ulen;
  143. }
  144. }
  145. }
  146. if (MB_CASE_TITLE == $mode)
  147. {
  148. $s = preg_replace_callback('/\b\p{Ll}/u', array(__CLASS__, 'title_case_callback'), $s);
  149. }
  150. if (INF === $encoding) return $s;
  151. else return iconv('UTF-8', $encoding, $s);
  152. }
  153. static function mb_internal_encoding($encoding = INF)
  154. {
  155. if (INF === $encoding) return self::$internal_encoding;
  156. if ('UTF-8' === strtoupper($encoding) || false !== @iconv($encoding, $encoding, ' '))
  157. {
  158. self::$internal_encoding = $encoding;
  159. return true;
  160. }
  161. return false;
  162. }
  163. static function mb_list_encodings()
  164. {
  165. return array('UTF-8');
  166. }
  167. static function mb_strlen($s, $encoding = INF)
  168. {
  169. INF === $encoding && $encoding = self::$internal_encoding;
  170. return iconv_strlen($s, $encoding . '//IGNORE');
  171. }
  172. static function mb_strpos ($haystack, $needle, $offset = 0, $encoding = INF)
  173. {
  174. INF === $encoding && $encoding = self::$internal_encoding;
  175. if ('' === (string) $needle)
  176. {
  177. user_error(__METHOD__ . ': Empty delimiter', E_USER_WARNING);
  178. return false;
  179. }
  180. else return iconv_strpos($haystack, $needle, $offset, $encoding . '//IGNORE');
  181. }
  182. static function mb_strrpos($haystack, $needle, $offset = 0, $encoding = INF)
  183. {
  184. INF === $encoding && $encoding = self::$internal_encoding;
  185. if ($offset != (int) $offset)
  186. {
  187. $offset = 0;
  188. }
  189. else if ($offset = (int) $offset)
  190. {
  191. $haystack = self::mb_substr($haystack, $offset, 2147483647, $encoding);
  192. }
  193. $pos = iconv_strrpos($haystack, $needle, $encoding . '//IGNORE');
  194. return false !== $pos ? $offset + $pos : false;
  195. }
  196. static function mb_strtolower($s, $encoding = INF)
  197. {
  198. return self::mb_convert_case($s, MB_CASE_LOWER, $encoding);
  199. }
  200. static function mb_strtoupper($s, $encoding = INF)
  201. {
  202. return self::mb_convert_case($s, MB_CASE_UPPER, $encoding);
  203. }
  204. static function mb_substitute_character($c = INF)
  205. {
  206. return INF !== $c ? false : 'none';
  207. }
  208. static function mb_substr($s, $start, $length = 2147483647, $encoding = INF)
  209. {
  210. INF === $encoding && $encoding = self::$internal_encoding;
  211. if ($start < 0)
  212. {
  213. $start = iconv_strlen($s, $encoding . '//IGNORE') + $start;
  214. if ($start < 0) $start = 0;
  215. }
  216. if ($length < 0)
  217. {
  218. $length = iconv_strlen($s, $encoding . '//IGNORE') + $length - $start;
  219. if ($length < 0) return '';
  220. }
  221. return (string) iconv_substr($s, $start, $length, $encoding . '//IGNORE');
  222. }
  223. static function mb_stripos($haystack, $needle, $offset = 0, $encoding = INF)
  224. {
  225. INF === $encoding && $encoding = self::$internal_encoding;
  226. $haystack = self::mb_convert_case($haystack, self::MB_CASE_FOLD, $encoding);
  227. $needle = self::mb_convert_case($needle, self::MB_CASE_FOLD, $encoding);
  228. return self::mb_strpos($haystack, $needle, $offset, $encoding);
  229. }
  230. static function mb_stristr($haystack, $needle, $part = false, $encoding = INF)
  231. {
  232. $pos = self::mb_stripos($haystack, $needle, $encoding);
  233. return self::getSubpart($pos, $part, $haystack, $encoding);
  234. }
  235. static function mb_strrchr($haystack, $needle, $part = false, $encoding = INF)
  236. {
  237. $needle = self::mb_substr($needle, 0, 1, $encoding);
  238. $pos = iconv_strrpos($haystack, $needle, $encoding);
  239. return self::getSubpart($pos, $part, $haystack, $encoding);
  240. }
  241. static function mb_strrichr($haystack, $needle, $part = false, $encoding = INF)
  242. {
  243. $needle = self::mb_substr($needle, 0, 1, $encoding);
  244. $pos = self::mb_strripos($haystack, $needle, $encoding);
  245. return self::getSubpart($pos, $part, $haystack, $encoding);
  246. }
  247. static function mb_strripos($haystack, $needle, $offset = 0, $encoding = INF)
  248. {
  249. INF === $encoding && $encoding = self::$internal_encoding;
  250. $haystack = self::mb_convert_case($haystack, self::MB_CASE_FOLD, $encoding);
  251. $needle = self::mb_convert_case($needle, self::MB_CASE_FOLD, $encoding);
  252. return self::mb_strrpos($haystack, $needle, $offset, $encoding);
  253. }
  254. static function mb_strstr($haystack, $needle, $part = false, $encoding = INF)
  255. {
  256. $pos = strpos($haystack, $needle);
  257. if (false === $pos) return false;
  258. if ($part) return substr($haystack, 0, $pos);
  259. else return substr($haystack, $pos);
  260. }
  261. protected static function getSubpart($pos, $part, $haystack, $encoding)
  262. {
  263. INF === $encoding && $encoding = self::$internal_encoding;
  264. if (false === $pos) return false;
  265. if ($part) return self::mb_substr($haystack, 0, $pos, $encoding);
  266. else return self::mb_substr($haystack, $pos, 2147483647, $encoding);
  267. }
  268. protected static function html_encoding_callback($m)
  269. {
  270. return htmlentities($m, ENT_COMPAT, 'UTF-8');
  271. }
  272. protected static function title_case_callback($s)
  273. {
  274. return self::mb_convert_case($s[0], MB_CASE_UPPER, 'UTF-8');
  275. }
  276. protected static function getData($file)
  277. {
  278. $file = __DIR__ . '/unidata/' . $file . '.ser';
  279. if (file_exists($file)) return unserialize(file_get_contents($file));
  280. else return false;
  281. }
  282. }