PageRenderTime 79ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/src/Ofire/vendor/php-lang-correct/UTF8.php

https://bitbucket.org/multifinger/ofire-utils
PHP | 4072 lines | 3218 code | 134 blank | 720 comment | 312 complexity | 9b23432c441cb479c84c2c67d66a973d MD5 | raw file
  1. <?php
  2. /**
  3. * PHP5 UTF-8 is a UTF-8 aware library of functions mirroring PHP's own string functions.
  4. *
  5. * A powerful solution for UTF-8 support in your framework/CMS, written in PHP.
  6. * This library is an advancement of http://sourceforge.net/projects/phputf8 (last updated in 2007).
  7. *
  8. * UTF-8 support in PHP 5.
  9. *
  10. * Features and benefits of using this class
  11. * * Interface-compatible with the standard PHP functions that deal with single-byte encodings
  12. * * Works without the PHP extensions ICONV and MBSTRING; if they are available, they are actively used!
  13. * * Useful functions that are missing from ICONV and MBSTRING
  14. * * Methods that take and return a string can also take and return null (useful for values selected from a database)
  15. * * Several methods are able to process arrays recursively
  16. * * A single interface and encapsulation (you can inherit and override)
  17. * * High performance, reliability and quality code
  18. * * PHP >= 5.3.x
  19. *
  20. * In Russian:
  21. *
  22. * Поддержка UTF-8 в PHP 5.
  23. *
  24. * Возможности и преимущества использования этого класса
  25. * * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками
  26. * * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются!
  27. * * Полезные функции, отсутствующие в ICONV и MBSTRING
  28. * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null (удобно при выборках значений из базы данных)
  29. * * Несколько методов умеют обрабатывать массивы рекурсивно
  30. * * Единый интерфейс и инкапсуляция (можно унаследоваться и переопределить методы)
  31. * * Высокая производительность, надёжность и качественный код
  32. * * PHP >= 5.3.x
  33. *
  34. * Example:
  35. * $s = 'Hello, Привет';
  36. * if (UTF8::is_utf8($s)) echo UTF8::strlen($s);
  37. *
  38. * UTF-8 encoding scheme:
  39. * 2^7 0x00000000 — 0x0000007F 0xxxxxxx
  40. * 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx
  41. * 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
  42. * 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  43. * 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864
  44. *
  45. * If I were the owner of the world, I would leave only 2 encodings: UTF-8 and UTF-32 ;-)
  46. *
  47. * Useful links
  48. * http://ru.wikipedia.org/wiki/UTF8
  49. * http://www.madore.org/~david/misc/unitest/ A Unicode Test Page
  50. * http://www.unicode.org/
  51. * http://www.unicode.org/reports/
  52. * http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm
  53. * http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm
  54. * http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode
  55. * http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search
  56. *
  57. * @link http://code.google.com/p/php5-utf8/
  58. * @license http://creativecommons.org/licenses/by-sa/3.0/
  59. * @author Nasibullin Rinat
  60. * @version 2.2.2
  61. */
  62. class UTF8
  63. {
  64. # UTF-8 byte sequence of the REPLACEMENT CHARACTER (used in place of a broken char)
  65. const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD
  66. /**
  67. * Regular expression fragment matching one UTF-8 character byte by byte (strict ASCII subset, no overlong forms, no surrogates), for patterns without the /u flag
  68. * @deprecated Instead, use a dot (".") and the flag /u, it works faster!
  69. * @var string NOTE(review): the value contains whitespace and # comments, so presumably it must be used with the /x modifier; confirm at the call sites
  70. */
  71. public static $char_re = ' [\x09\x0A\x0D\x20-\x7E] # ASCII strict
  72. # [\x00-\x7F] # ASCII non-strict (including control chars)
  73. | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
  74. | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
  75. | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
  76. | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
  77. | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
  78. | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
  79. | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
  80. ';
  81. /**
  82. * Combining diacritical marks (Unicode 5.1), written as a byte-wise regular expression fragment.
  83. *
  84. * For example, Russian letters in composed form: "Ё" (U+0401), "Й" (U+0419),
  85. * decomposed form: (U+0415 U+0308), (U+0418 U+0306)
  86. *
  87. * @link http://www.unicode.org/charts/PDF/U0300.pdf
  88. * @link http://www.unicode.org/charts/PDF/U1DC0.pdf
  89. * @link http://www.unicode.org/charts/PDF/UFE20.pdf
  90. * @var string The first alternative spans \xcc\x80-\xcc\xbf (U+0300 - U+033F) plus \xcd\x80-\xcd\xaf (U+0340 - U+036F); the value contains whitespace and # comments, so presumably it must be used with the /x modifier (confirm at the call sites)
  91. */
  92. #public static $diactrical_re = '\p{M}'; #alternative, but only with /u flag
  93. public static $diactrical_re = ' \xcc[\x80-\xbf]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters)
  94. | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 U+20FF (for symbols)
  95. | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 U+1DFF (supplement)
  96. | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 U+FE2F (combining half marks)
  97. ';
  98. /**
  99. * @var array Map of HTML special-character entity => literal character (only &quot;, &amp;, &lt; and &gt;)
  100. */
  101. public static $html_special_chars_table = array(
  102. '&quot;' => "\x22", #U+0022 ["] &#34; quotation mark = APL quote
  103. '&amp;' => "\x26", #U+0026 [&] &#38; ampersand
  104. '&lt;' => "\x3c", #U+003C [<] &#60; less-than sign
  105. '&gt;' => "\x3e", #U+003E [>] &#62; greater-than sign
  106. );
  107. /**
  108. * @link http://www.fileformat.info/format/w3c/entitytest.htm?sort=Unicode%20Character HTML Entity Browser Test Page
  109. * @var array
  110. */
  111. public static $html_entity_table = array(
  112. #Latin-1 Entities:
  113. '&nbsp;' => "\xc2\xa0", #U+00A0 [ ] no-break space = non-breaking space
  114. '&iexcl;' => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark
  115. '&cent;' => "\xc2\xa2", #U+00A2 [¢] cent sign
  116. '&pound;' => "\xc2\xa3", #U+00A3 [£] pound sign
  117. '&curren;' => "\xc2\xa4", #U+00A4 [¤] currency sign
  118. '&yen;' => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign
  119. '&brvbar;' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar
  120. '&sect;' => "\xc2\xa7", #U+00A7 [§] section sign
  121. '&uml;' => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis
  122. '&copy;' => "\xc2\xa9", #U+00A9 [©] copyright sign
  123. '&ordf;' => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator
  124. '&laquo;' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet
  125. '&not;' => "\xc2\xac", #U+00AC [¬] not sign
  126. '&shy;' => "\xc2\xad", #U+00AD [ ] soft hyphen = discretionary hyphen
  127. '&reg;' => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign
  128. '&macr;' => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar
  129. '&deg;' => "\xc2\xb0", #U+00B0 [°] degree sign
  130. '&plusmn;' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign
  131. '&sup2;' => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared
  132. '&sup3;' => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed
  133. '&acute;' => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute
  134. '&micro;' => "\xc2\xb5", #U+00B5 [µ] micro sign
  135. '&para;' => "\xc2\xb6", #U+00B6 [] pilcrow sign = paragraph sign
  136. '&middot;' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot
  137. '&cedil;' => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla
  138. '&sup1;' => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one
  139. '&ordm;' => "\xc2\xba", #U+00BA [º] masculine ordinal indicator
  140. '&raquo;' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet
  141. '&frac14;' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter
  142. '&frac12;' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half
  143. '&frac34;' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters
  144. '&iquest;' => "\xc2\xbf", #U+00BF [¿] inverted question mark = turned question mark
  145. #Latin capital letter
  146. '&Agrave;' => "\xc3\x80", #Latin capital letter A with grave = Latin capital letter A grave
  147. '&Aacute;' => "\xc3\x81", #Latin capital letter A with acute
  148. '&Acirc;' => "\xc3\x82", #Latin capital letter A with circumflex
  149. '&Atilde;' => "\xc3\x83", #Latin capital letter A with tilde
  150. '&Auml;' => "\xc3\x84", #Latin capital letter A with diaeresis
  151. '&Aring;' => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring
  152. '&AElig;' => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE
  153. '&Ccedil;' => "\xc3\x87", #Latin capital letter C with cedilla
  154. '&Egrave;' => "\xc3\x88", #Latin capital letter E with grave
  155. '&Eacute;' => "\xc3\x89", #Latin capital letter E with acute
  156. '&Ecirc;' => "\xc3\x8a", #Latin capital letter E with circumflex
  157. '&Euml;' => "\xc3\x8b", #Latin capital letter E with diaeresis
  158. '&Igrave;' => "\xc3\x8c", #Latin capital letter I with grave
  159. '&Iacute;' => "\xc3\x8d", #Latin capital letter I with acute
  160. '&Icirc;' => "\xc3\x8e", #Latin capital letter I with circumflex
  161. '&Iuml;' => "\xc3\x8f", #Latin capital letter I with diaeresis
  162. '&ETH;' => "\xc3\x90", #Latin capital letter ETH
  163. '&Ntilde;' => "\xc3\x91", #Latin capital letter N with tilde
  164. '&Ograve;' => "\xc3\x92", #Latin capital letter O with grave
  165. '&Oacute;' => "\xc3\x93", #Latin capital letter O with acute
  166. '&Ocirc;' => "\xc3\x94", #Latin capital letter O with circumflex
  167. '&Otilde;' => "\xc3\x95", #Latin capital letter O with tilde
  168. '&Ouml;' => "\xc3\x96", #Latin capital letter O with diaeresis
  169. '&times;' => "\xc3\x97", #U+00D7 [×] multiplication sign
  170. '&Oslash;' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash
  171. '&Ugrave;' => "\xc3\x99", #Latin capital letter U with grave
  172. '&Uacute;' => "\xc3\x9a", #Latin capital letter U with acute
  173. '&Ucirc;' => "\xc3\x9b", #Latin capital letter U with circumflex
  174. '&Uuml;' => "\xc3\x9c", #Latin capital letter U with diaeresis
  175. '&Yacute;' => "\xc3\x9d", #Latin capital letter Y with acute
  176. '&THORN;' => "\xc3\x9e", #Latin capital letter THORN
  177. #Latin small letter
  178. '&szlig;' => "\xc3\x9f", #Latin small letter sharp s = ess-zed
  179. '&agrave;' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave
  180. '&aacute;' => "\xc3\xa1", #Latin small letter a with acute
  181. '&acirc;' => "\xc3\xa2", #Latin small letter a with circumflex
  182. '&atilde;' => "\xc3\xa3", #Latin small letter a with tilde
  183. '&auml;' => "\xc3\xa4", #Latin small letter a with diaeresis
  184. '&aring;' => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring
  185. '&aelig;' => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae
  186. '&ccedil;' => "\xc3\xa7", #Latin small letter c with cedilla
  187. '&egrave;' => "\xc3\xa8", #Latin small letter e with grave
  188. '&eacute;' => "\xc3\xa9", #Latin small letter e with acute
  189. '&ecirc;' => "\xc3\xaa", #Latin small letter e with circumflex
  190. '&euml;' => "\xc3\xab", #Latin small letter e with diaeresis
  191. '&igrave;' => "\xc3\xac", #Latin small letter i with grave
  192. '&iacute;' => "\xc3\xad", #Latin small letter i with acute
  193. '&icirc;' => "\xc3\xae", #Latin small letter i with circumflex
  194. '&iuml;' => "\xc3\xaf", #Latin small letter i with diaeresis
  195. '&eth;' => "\xc3\xb0", #Latin small letter eth
  196. '&ntilde;' => "\xc3\xb1", #Latin small letter n with tilde
  197. '&ograve;' => "\xc3\xb2", #Latin small letter o with grave
  198. '&oacute;' => "\xc3\xb3", #Latin small letter o with acute
  199. '&ocirc;' => "\xc3\xb4", #Latin small letter o with circumflex
  200. '&otilde;' => "\xc3\xb5", #Latin small letter o with tilde
  201. '&ouml;' => "\xc3\xb6", #Latin small letter o with diaeresis
  202. '&divide;' => "\xc3\xb7", #U+00F7 [÷] division sign
  203. '&oslash;' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash
  204. '&ugrave;' => "\xc3\xb9", #Latin small letter u with grave
  205. '&uacute;' => "\xc3\xba", #Latin small letter u with acute
  206. '&ucirc;' => "\xc3\xbb", #Latin small letter u with circumflex
  207. '&uuml;' => "\xc3\xbc", #Latin small letter u with diaeresis
  208. '&yacute;' => "\xc3\xbd", #Latin small letter y with acute
  209. '&thorn;' => "\xc3\xbe", #Latin small letter thorn
  210. '&yuml;' => "\xc3\xbf", #Latin small letter y with diaeresis
  211. #Symbols and Greek Letters:
  212. '&fnof;' => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin
  213. '&Alpha;' => "\xce\x91", #Greek capital letter alpha
  214. '&Beta;' => "\xce\x92", #Greek capital letter beta
  215. '&Gamma;' => "\xce\x93", #Greek capital letter gamma
  216. '&Delta;' => "\xce\x94", #Greek capital letter delta
  217. '&Epsilon;' => "\xce\x95", #Greek capital letter epsilon
  218. '&Zeta;' => "\xce\x96", #Greek capital letter zeta
  219. '&Eta;' => "\xce\x97", #Greek capital letter eta
  220. '&Theta;' => "\xce\x98", #Greek capital letter theta
  221. '&Iota;' => "\xce\x99", #Greek capital letter iota
  222. '&Kappa;' => "\xce\x9a", #Greek capital letter kappa
  223. '&Lambda;' => "\xce\x9b", #Greek capital letter lambda
  224. '&Mu;' => "\xce\x9c", #Greek capital letter mu
  225. '&Nu;' => "\xce\x9d", #Greek capital letter nu
  226. '&Xi;' => "\xce\x9e", #Greek capital letter xi
  227. '&Omicron;' => "\xce\x9f", #Greek capital letter omicron
  228. '&Pi;' => "\xce\xa0", #Greek capital letter pi
  229. '&Rho;' => "\xce\xa1", #Greek capital letter rho
  230. '&Sigma;' => "\xce\xa3", #Greek capital letter sigma
  231. '&Tau;' => "\xce\xa4", #Greek capital letter tau
  232. '&Upsilon;' => "\xce\xa5", #Greek capital letter upsilon
  233. '&Phi;' => "\xce\xa6", #Greek capital letter phi
  234. '&Chi;' => "\xce\xa7", #Greek capital letter chi
  235. '&Psi;' => "\xce\xa8", #Greek capital letter psi
  236. '&Omega;' => "\xce\xa9", #Greek capital letter omega
  237. '&alpha;' => "\xce\xb1", #Greek small letter alpha
  238. '&beta;' => "\xce\xb2", #Greek small letter beta
  239. '&gamma;' => "\xce\xb3", #Greek small letter gamma
  240. '&delta;' => "\xce\xb4", #Greek small letter delta
  241. '&epsilon;' => "\xce\xb5", #Greek small letter epsilon
  242. '&zeta;' => "\xce\xb6", #Greek small letter zeta
  243. '&eta;' => "\xce\xb7", #Greek small letter eta
  244. '&theta;' => "\xce\xb8", #Greek small letter theta
  245. '&iota;' => "\xce\xb9", #Greek small letter iota
  246. '&kappa;' => "\xce\xba", #Greek small letter kappa
  247. '&lambda;' => "\xce\xbb", #Greek small letter lambda
  248. '&mu;' => "\xce\xbc", #Greek small letter mu
  249. '&nu;' => "\xce\xbd", #Greek small letter nu
  250. '&xi;' => "\xce\xbe", #Greek small letter xi
  251. '&omicron;' => "\xce\xbf", #Greek small letter omicron
  252. '&pi;' => "\xcf\x80", #Greek small letter pi
  253. '&rho;' => "\xcf\x81", #Greek small letter rho
  254. '&sigmaf;' => "\xcf\x82", #Greek small letter final sigma
  255. '&sigma;' => "\xcf\x83", #Greek small letter sigma
  256. '&tau;' => "\xcf\x84", #Greek small letter tau
  257. '&upsilon;' => "\xcf\x85", #Greek small letter upsilon
  258. '&phi;' => "\xcf\x86", #Greek small letter phi
  259. '&chi;' => "\xcf\x87", #Greek small letter chi
  260. '&psi;' => "\xcf\x88", #Greek small letter psi
  261. '&omega;' => "\xcf\x89", #Greek small letter omega
  262. '&thetasym;'=> "\xcf\x91", #Greek small letter theta symbol
  263. '&upsih;' => "\xcf\x92", #Greek upsilon with hook symbol
  264. '&piv;' => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol
  265. '&bull;' => "\xe2\x80\xa2", #U+2022 [] bullet = black small circle
  266. '&hellip;' => "\xe2\x80\xa6", #U+2026 [] horizontal ellipsis = three dot leader
  267. '&prime;' => "\xe2\x80\xb2", #U+2032 [] prime = minutes = feet (для обозначения минут и футов)
  268. '&Prime;' => "\xe2\x80\xb3", #U+2033 [] double prime = seconds = inches (для обозначения секунд и дюймов).
  269. '&oline;' => "\xe2\x80\xbe", #U+203E [] overline = spacing overscore
  270. '&frasl;' => "\xe2\x81\x84", #U+2044 [] fraction slash
  271. '&weierp;' => "\xe2\x84\x98", #U+2118 [] script capital P = power set = Weierstrass p
  272. '&image;' => "\xe2\x84\x91", #U+2111 [] blackletter capital I = imaginary part
  273. '&real;' => "\xe2\x84\x9c", #U+211C [] blackletter capital R = real part symbol
  274. '&trade;' => "\xe2\x84\xa2", #U+2122 [] trade mark sign
  275. '&alefsym;' => "\xe2\x84\xb5", #U+2135 [] alef symbol = first transfinite cardinal
  276. '&larr;' => "\xe2\x86\x90", #U+2190 [] leftwards arrow
  277. '&uarr;' => "\xe2\x86\x91", #U+2191 [] upwards arrow
  278. '&rarr;' => "\xe2\x86\x92", #U+2192 [] rightwards arrow
  279. '&darr;' => "\xe2\x86\x93", #U+2193 [] downwards arrow
  280. '&harr;' => "\xe2\x86\x94", #U+2194 [] left right arrow
  281. '&crarr;' => "\xe2\x86\xb5", #U+21B5 [] downwards arrow with corner leftwards = carriage return
  282. '&lArr;' => "\xe2\x87\x90", #U+21D0 [] leftwards double arrow
  283. '&uArr;' => "\xe2\x87\x91", #U+21D1 [] upwards double arrow
  284. '&rArr;' => "\xe2\x87\x92", #U+21D2 [] rightwards double arrow
  285. '&dArr;' => "\xe2\x87\x93", #U+21D3 [] downwards double arrow
  286. '&hArr;' => "\xe2\x87\x94", #U+21D4 [] left right double arrow
  287. '&forall;' => "\xe2\x88\x80", #U+2200 [] for all
  288. '&part;' => "\xe2\x88\x82", #U+2202 [] partial differential
  289. '&exist;' => "\xe2\x88\x83", #U+2203 [] there exists
  290. '&empty;' => "\xe2\x88\x85", #U+2205 [] empty set = null set = diameter
  291. '&nabla;' => "\xe2\x88\x87", #U+2207 [] nabla = backward difference
  292. '&isin;' => "\xe2\x88\x88", #U+2208 [] element of
  293. '&notin;' => "\xe2\x88\x89", #U+2209 [] not an element of
  294. '&ni;' => "\xe2\x88\x8b", #U+220B [] contains as member
  295. '&prod;' => "\xe2\x88\x8f", #U+220F [] n-ary product = product sign
  296. '&sum;' => "\xe2\x88\x91", #U+2211 [] n-ary sumation
  297. '&minus;' => "\xe2\x88\x92", #U+2212 [] minus sign
  298. '&lowast;' => "\xe2\x88\x97", #U+2217 [] asterisk operator
  299. '&radic;' => "\xe2\x88\x9a", #U+221A [] square root = radical sign
  300. '&prop;' => "\xe2\x88\x9d", #U+221D [] proportional to
  301. '&infin;' => "\xe2\x88\x9e", #U+221E [] infinity
  302. '&ang;' => "\xe2\x88\xa0", #U+2220 [] angle
  303. '&and;' => "\xe2\x88\xa7", #U+2227 [] logical and = wedge
  304. '&or;' => "\xe2\x88\xa8", #U+2228 [] logical or = vee
  305. '&cap;' => "\xe2\x88\xa9", #U+2229 [] intersection = cap
  306. '&cup;' => "\xe2\x88\xaa", #U+222A [] union = cup
  307. '&int;' => "\xe2\x88\xab", #U+222B [] integral
  308. '&there4;' => "\xe2\x88\xb4", #U+2234 [] therefore
  309. '&sim;' => "\xe2\x88\xbc", #U+223C [] tilde operator = varies with = similar to
  310. '&cong;' => "\xe2\x89\x85", #U+2245 [] approximately equal to
  311. '&asymp;' => "\xe2\x89\x88", #U+2248 [] almost equal to = asymptotic to
  312. '&ne;' => "\xe2\x89\xa0", #U+2260 [] not equal to
  313. '&equiv;' => "\xe2\x89\xa1", #U+2261 [] identical to
  314. '&le;' => "\xe2\x89\xa4", #U+2264 [] less-than or equal to
  315. '&ge;' => "\xe2\x89\xa5", #U+2265 [] greater-than or equal to
  316. '&sub;' => "\xe2\x8a\x82", #U+2282 [] subset of
  317. '&sup;' => "\xe2\x8a\x83", #U+2283 [] superset of
  318. '&nsub;' => "\xe2\x8a\x84", #U+2284 [] not a subset of
  319. '&sube;' => "\xe2\x8a\x86", #U+2286 [] subset of or equal to
  320. '&supe;' => "\xe2\x8a\x87", #U+2287 [] superset of or equal to
  321. '&oplus;' => "\xe2\x8a\x95", #U+2295 [] circled plus = direct sum
  322. '&otimes;' => "\xe2\x8a\x97", #U+2297 [] circled times = vector product
  323. '&perp;' => "\xe2\x8a\xa5", #U+22A5 [] up tack = orthogonal to = perpendicular
  324. '&sdot;' => "\xe2\x8b\x85", #U+22C5 [] dot operator
  325. '&lceil;' => "\xe2\x8c\x88", #U+2308 [] left ceiling = APL upstile
  326. '&rceil;' => "\xe2\x8c\x89", #U+2309 [] right ceiling
  327. '&lfloor;' => "\xe2\x8c\x8a", #U+230A [] left floor = APL downstile
  328. '&rfloor;' => "\xe2\x8c\x8b", #U+230B [] right floor
  329. '&lang;' => "\xe2\x8c\xa9", #U+2329 [] left-pointing angle bracket = bra
  330. '&rang;' => "\xe2\x8c\xaa", #U+232A [] right-pointing angle bracket = ket
  331. '&loz;' => "\xe2\x97\x8a", #U+25CA [] lozenge
  332. '&spades;' => "\xe2\x99\xa0", #U+2660 [] black spade suit
  333. '&clubs;' => "\xe2\x99\xa3", #U+2663 [] black club suit = shamrock
  334. '&hearts;' => "\xe2\x99\xa5", #U+2665 [] black heart suit = valentine
  335. '&diams;' => "\xe2\x99\xa6", #U+2666 [] black diamond suit
  336. #Other Special Characters:
  337. '&OElig;' => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE
  338. '&oelig;' => "\xc5\x93", #U+0153 [œ] Latin small ligature oe
  339. '&Scaron;' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron
  340. '&scaron;' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron
  341. '&Yuml;' => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis
  342. '&circ;' => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent
  343. '&tilde;' => "\xcb\x9c", #U+02DC [˜] small tilde
  344. '&ensp;' => "\xe2\x80\x82", #U+2002 [] en space
  345. '&emsp;' => "\xe2\x80\x83", #U+2003 [] em space
  346. '&thinsp;' => "\xe2\x80\x89", #U+2009 [] thin space
  347. '&zwnj;' => "\xe2\x80\x8c", #U+200C [] zero width non-joiner
  348. '&zwj;' => "\xe2\x80\x8d", #U+200D [] zero width joiner
  349. '&lrm;' => "\xe2\x80\x8e", #U+200E [] left-to-right mark
  350. '&rlm;' => "\xe2\x80\x8f", #U+200F [] right-to-left mark
  351. '&ndash;' => "\xe2\x80\x93", #U+2013 [] en dash
  352. '&mdash;' => "\xe2\x80\x94", #U+2014 [] em dash
  353. '&lsquo;' => "\xe2\x80\x98", #U+2018 [] left single quotation mark
  354. '&rsquo;' => "\xe2\x80\x99", #U+2019 [] right single quotation mark (and apostrophe!)
  355. '&sbquo;' => "\xe2\x80\x9a", #U+201A [] single low-9 quotation mark
  356. '&ldquo;' => "\xe2\x80\x9c", #U+201C [] left double quotation mark
  357. '&rdquo;' => "\xe2\x80\x9d", #U+201D [] right double quotation mark
  358. '&bdquo;' => "\xe2\x80\x9e", #U+201E [] double low-9 quotation mark
  359. '&dagger;' => "\xe2\x80\xa0", #U+2020 [] dagger
  360. '&Dagger;' => "\xe2\x80\xa1", #U+2021 [] double dagger
  361. '&permil;' => "\xe2\x80\xb0", #U+2030 [] per mille sign
  362. '&lsaquo;' => "\xe2\x80\xb9", #U+2039 [] single left-pointing angle quotation mark
  363. '&rsaquo;' => "\xe2\x80\xba", #U+203A [] single right-pointing angle quotation mark
  364. '&euro;' => "\xe2\x82\xac", #U+20AC [] euro sign
  365. );
  366. /**
  367. * This table contains the data on how cp1259 characters map to Unicode (UTF-8).
  368. * The cp1259 map describes the standard Tatar Cyrillic charset and is based on the cp1251 table.
  369. * cp1259 is an outdated single-byte encoding for the Tatar language,
  370. * which includes all the Russian letters from cp1251.
  371. *
  372. * @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz
  373. * @link http://www.unicode.org/charts/PDF/U0400.pdf
  374. */
  375. public static $cp1259_table = array(
  376. #bytes from 0x00 to 0x7F (ASCII) saved as is
  377. "\x80" => "\xd3\x98", #U+04d8 CYRILLIC CAPITAL LETTER SCHWA
  378. "\x81" => "\xd0\x83", #U+0403 CYRILLIC CAPITAL LETTER GJE
  379. "\x82" => "\xe2\x80\x9a", #U+201a SINGLE LOW-9 QUOTATION MARK
  380. "\x83" => "\xd1\x93", #U+0453 CYRILLIC SMALL LETTER GJE
  381. "\x84" => "\xe2\x80\x9e", #U+201e DOUBLE LOW-9 QUOTATION MARK
  382. "\x85" => "\xe2\x80\xa6", #U+2026 HORIZONTAL ELLIPSIS
  383. "\x86" => "\xe2\x80\xa0", #U+2020 DAGGER
  384. "\x87" => "\xe2\x80\xa1", #U+2021 DOUBLE DAGGER
  385. "\x88" => "\xe2\x82\xac", #U+20ac EURO SIGN
  386. "\x89" => "\xe2\x80\xb0", #U+2030 PER MILLE SIGN
  387. "\x8a" => "\xd3\xa8", #U+04e8 CYRILLIC CAPITAL LETTER BARRED O
  388. "\x8b" => "\xe2\x80\xb9", #U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  389. "\x8c" => "\xd2\xae", #U+04ae CYRILLIC CAPITAL LETTER STRAIGHT U
  390. "\x8d" => "\xd2\x96", #U+0496 CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
  391. "\x8e" => "\xd2\xa2", #U+04a2 CYRILLIC CAPITAL LETTER EN WITH HOOK
  392. "\x8f" => "\xd2\xba", #U+04ba CYRILLIC CAPITAL LETTER SHHA
  393. "\x90" => "\xd3\x99", #U+04d9 CYRILLIC SMALL LETTER SCHWA
  394. "\x91" => "\xe2\x80\x98", #U+2018 LEFT SINGLE QUOTATION MARK
  395. "\x92" => "\xe2\x80\x99", #U+2019 RIGHT SINGLE QUOTATION MARK
  396. "\x93" => "\xe2\x80\x9c", #U+201c LEFT DOUBLE QUOTATION MARK
  397. "\x94" => "\xe2\x80\x9d", #U+201d RIGHT DOUBLE QUOTATION MARK
  398. "\x95" => "\xe2\x80\xa2", #U+2022 BULLET
  399. "\x96" => "\xe2\x80\x93", #U+2013 EN DASH
  400. "\x97" => "\xe2\x80\x94", #U+2014 EM DASH
  401. #"\x98" #UNDEFINED
  402. "\x99" => "\xe2\x84\xa2", #U+2122 TRADE MARK SIGN
  403. "\x9a" => "\xd3\xa9", #U+04e9 CYRILLIC SMALL LETTER BARRED O
  404. "\x9b" => "\xe2\x80\xba", #U+203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  405. "\x9c" => "\xd2\xaf", #U+04af CYRILLIC SMALL LETTER STRAIGHT U
  406. "\x9d" => "\xd2\x97", #U+0497 CYRILLIC SMALL LETTER ZHE WITH DESCENDER
  407. "\x9e" => "\xd2\xa3", #U+04a3 CYRILLIC SMALL LETTER EN WITH HOOK
  408. "\x9f" => "\xd2\xbb", #U+04bb CYRILLIC SMALL LETTER SHHA
  409. "\xa0" => "\xc2\xa0", #U+00a0 NO-BREAK SPACE
  410. "\xa1" => "\xd0\x8e", #U+040e CYRILLIC CAPITAL LETTER SHORT U
  411. "\xa2" => "\xd1\x9e", #U+045e CYRILLIC SMALL LETTER SHORT U
  412. "\xa3" => "\xd0\x88", #U+0408 CYRILLIC CAPITAL LETTER JE
  413. "\xa4" => "\xc2\xa4", #U+00a4 CURRENCY SIGN
  414. "\xa5" => "\xd2\x90", #U+0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN
  415. "\xa6" => "\xc2\xa6", #U+00a6 BROKEN BAR
  416. "\xa7" => "\xc2\xa7", #U+00a7 SECTION SIGN
  417. "\xa8" => "\xd0\x81", #U+0401 CYRILLIC CAPITAL LETTER IO
  418. "\xa9" => "\xc2\xa9", #U+00a9 COPYRIGHT SIGN
  419. "\xaa" => "\xd0\x84", #U+0404 CYRILLIC CAPITAL LETTER UKRAINIAN IE
  420. "\xab" => "\xc2\xab", #U+00ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  421. "\xac" => "\xc2\xac", #U+00ac NOT SIGN
  422. "\xad" => "\xc2\xad", #U+00ad SOFT HYPHEN
  423. "\xae" => "\xc2\xae", #U+00ae REGISTERED SIGN
  424. "\xaf" => "\xd0\x87", #U+0407 CYRILLIC CAPITAL LETTER YI
  425. "\xb0" => "\xc2\xb0", #U+00b0 DEGREE SIGN
  426. "\xb1" => "\xc2\xb1", #U+00b1 PLUS-MINUS SIGN
  427. "\xb2" => "\xd0\x86", #U+0406 CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
  428. "\xb3" => "\xd1\x96", #U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
  429. "\xb4" => "\xd2\x91", #U+0491 CYRILLIC SMALL LETTER GHE WITH UPTURN
  430. "\xb5" => "\xc2\xb5", #U+00b5 MICRO SIGN
  431. "\xb6" => "\xc2\xb6", #U+00b6 PILCROW SIGN
  432. "\xb7" => "\xc2\xb7", #U+00b7 MIDDLE DOT
  433. "\xb8" => "\xd1\x91", #U+0451 CYRILLIC SMALL LETTER IO
  434. "\xb9" => "\xe2\x84\x96", #U+2116 NUMERO SIGN
  435. "\xba" => "\xd1\x94", #U+0454 CYRILLIC SMALL LETTER UKRAINIAN IE
  436. "\xbb" => "\xc2\xbb", #U+00bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  437. "\xbc" => "\xd1\x98", #U+0458 CYRILLIC SMALL LETTER JE
  438. "\xbd" => "\xd0\x85", #U+0405 CYRILLIC CAPITAL LETTER DZE
  439. "\xbe" => "\xd1\x95", #U+0455 CYRILLIC SMALL LETTER DZE
  440. "\xbf" => "\xd1\x97", #U+0457 CYRILLIC SMALL LETTER YI
  441. "\xc0" => "\xd0\x90", #U+0410 CYRILLIC CAPITAL LETTER A
  442. "\xc1" => "\xd0\x91", #U+0411 CYRILLIC CAPITAL LETTER BE
  443. "\xc2" => "\xd0\x92", #U+0412 CYRILLIC CAPITAL LETTER VE
  444. "\xc3" => "\xd0\x93", #U+0413 CYRILLIC CAPITAL LETTER GHE
  445. "\xc4" => "\xd0\x94", #U+0414 CYRILLIC CAPITAL LETTER DE
  446. "\xc5" => "\xd0\x95", #U+0415 CYRILLIC CAPITAL LETTER IE
  447. "\xc6" => "\xd0\x96", #U+0416 CYRILLIC CAPITAL LETTER ZHE
  448. "\xc7" => "\xd0\x97", #U+0417 CYRILLIC CAPITAL LETTER ZE
  449. "\xc8" => "\xd0\x98", #U+0418 CYRILLIC CAPITAL LETTER I
  450. "\xc9" => "\xd0\x99", #U+0419 CYRILLIC CAPITAL LETTER SHORT I
  451. "\xca" => "\xd0\x9a", #U+041a CYRILLIC CAPITAL LETTER KA
  452. "\xcb" => "\xd0\x9b", #U+041b CYRILLIC CAPITAL LETTER EL
  453. "\xcc" => "\xd0\x9c", #U+041c CYRILLIC CAPITAL LETTER EM
  454. "\xcd" => "\xd0\x9d", #U+041d CYRILLIC CAPITAL LETTER EN
  455. "\xce" => "\xd0\x9e", #U+041e CYRILLIC CAPITAL LETTER O
  456. "\xcf" => "\xd0\x9f", #U+041f CYRILLIC CAPITAL LETTER PE
  457. "\xd0" => "\xd0\xa0", #U+0420 CYRILLIC CAPITAL LETTER ER
  458. "\xd1" => "\xd0\xa1", #U+0421 CYRILLIC CAPITAL LETTER ES
  459. "\xd2" => "\xd0\xa2", #U+0422 CYRILLIC CAPITAL LETTER TE
  460. "\xd3" => "\xd0\xa3", #U+0423 CYRILLIC CAPITAL LETTER U
  461. "\xd4" => "\xd0\xa4", #U+0424 CYRILLIC CAPITAL LETTER EF
  462. "\xd5" => "\xd0\xa5", #U+0425 CYRILLIC CAPITAL LETTER HA
  463. "\xd6" => "\xd0\xa6", #U+0426 CYRILLIC CAPITAL LETTER TSE
  464. "\xd7" => "\xd0\xa7", #U+0427 CYRILLIC CAPITAL LETTER CHE
  465. "\xd8" => "\xd0\xa8", #U+0428 CYRILLIC CAPITAL LETTER SHA
  466. "\xd9" => "\xd0\xa9", #U+0429 CYRILLIC CAPITAL LETTER SHCHA
  467. "\xda" => "\xd0\xaa", #U+042a CYRILLIC CAPITAL LETTER HARD SIGN
  468. "\xdb" => "\xd0\xab", #U+042b CYRILLIC CAPITAL LETTER YERU
  469. "\xdc" => "\xd0\xac", #U+042c CYRILLIC CAPITAL LETTER SOFT SIGN
  470. "\xdd" => "\xd0\xad", #U+042d CYRILLIC CAPITAL LETTER E
  471. "\xde" => "\xd0\xae", #U+042e CYRILLIC CAPITAL LETTER YU
  472. "\xdf" => "\xd0\xaf", #U+042f CYRILLIC CAPITAL LETTER YA
  473. "\xe0" => "\xd0\xb0", #U+0430 CYRILLIC SMALL LETTER A
  474. "\xe1" => "\xd0\xb1", #U+0431 CYRILLIC SMALL LETTER BE
  475. "\xe2" => "\xd0\xb2", #U+0432 CYRILLIC SMALL LETTER VE
  476. "\xe3" => "\xd0\xb3", #U+0433 CYRILLIC SMALL LETTER GHE
  477. "\xe4" => "\xd0\xb4", #U+0434 CYRILLIC SMALL LETTER DE
  478. "\xe5" => "\xd0\xb5", #U+0435 CYRILLIC SMALL LETTER IE
  479. "\xe6" => "\xd0\xb6", #U+0436 CYRILLIC SMALL LETTER ZHE
  480. "\xe7" => "\xd0\xb7", #U+0437 CYRILLIC SMALL LETTER ZE
  481. "\xe8" => "\xd0\xb8", #U+0438 CYRILLIC SMALL LETTER I
  482. "\xe9" => "\xd0\xb9", #U+0439 CYRILLIC SMALL LETTER SHORT I
  483. "\xea" => "\xd0\xba", #U+043a CYRILLIC SMALL LETTER KA
  484. "\xeb" => "\xd0\xbb", #U+043b CYRILLIC SMALL LETTER EL
  485. "\xec" => "\xd0\xbc", #U+043c CYRILLIC SMALL LETTER EM
  486. "\xed" => "\xd0\xbd", #U+043d CYRILLIC SMALL LETTER EN
  487. "\xee" => "\xd0\xbe", #U+043e CYRILLIC SMALL LETTER O
  488. "\xef" => "\xd0\xbf", #U+043f CYRILLIC SMALL LETTER PE
  489. "\xf0" => "\xd1\x80", #U+0440 CYRILLIC SMALL LETTER ER
  490. "\xf1" => "\xd1\x81", #U+0441 CYRILLIC SMALL LETTER ES
  491. "\xf2" => "\xd1\x82", #U+0442 CYRILLIC SMALL LETTER TE
  492. "\xf3" => "\xd1\x83", #U+0443 CYRILLIC SMALL LETTER U
  493. "\xf4" => "\xd1\x84", #U+0444 CYRILLIC SMALL LETTER EF
  494. "\xf5" => "\xd1\x85", #U+0445 CYRILLIC SMALL LETTER HA
  495. "\xf6" => "\xd1\x86", #U+0446 CYRILLIC SMALL LETTER TSE
  496. "\xf7" => "\xd1\x87", #U+0447 CYRILLIC SMALL LETTER CHE
  497. "\xf8" => "\xd1\x88", #U+0448 CYRILLIC SMALL LETTER SHA
  498. "\xf9" => "\xd1\x89", #U+0449 CYRILLIC SMALL LETTER SHCHA
  499. "\xfa" => "\xd1\x8a", #U+044a CYRILLIC SMALL LETTER HARD SIGN
  500. "\xfb" => "\xd1\x8b", #U+044b CYRILLIC SMALL LETTER YERU
  501. "\xfc" => "\xd1\x8c", #U+044c CYRILLIC SMALL LETTER SOFT SIGN
  502. "\xfd" => "\xd1\x8d", #U+044d CYRILLIC SMALL LETTER E
  503. "\xfe" => "\xd1\x8e", #U+044e CYRILLIC SMALL LETTER YU
  504. "\xff" => "\xd1\x8f", #U+044f CYRILLIC SMALL LETTER YA
  505. );
  506. /**
  507. * UTF-8 Case lookup table
  508. *
  509. * This lookup table maps upper case letters to their corresponding
  510. * lower case letters in UTF-8
  511. *
  512. * @author Andreas Gohr <andi@splitbrain.org>
  513. */
  514. public static $convert_case_table = array(
  515. #CASE_UPPER => case_lower
  516. "\x41" => "\x61", #A a
  517. "\x42" => "\x62", #B b
  518. "\x43" => "\x63", #C c
  519. "\x44" => "\x64", #D d
  520. "\x45" => "\x65", #E e
  521. "\x46" => "\x66", #F f
  522. "\x47" => "\x67", #G g
  523. "\x48" => "\x68", #H h
  524. "\x49" => "\x69", #I i
  525. "\x4a" => "\x6a", #J j
  526. "\x4b" => "\x6b", #K k
  527. "\x4c" => "\x6c", #L l
  528. "\x4d" => "\x6d", #M m
  529. "\x4e" => "\x6e", #N n
  530. "\x4f" => "\x6f", #O o
  531. "\x50" => "\x70", #P p
  532. "\x51" => "\x71", #Q q
  533. "\x52" => "\x72", #R r
  534. "\x53" => "\x73", #S s
  535. "\x54" => "\x74", #T t
  536. "\x55" => "\x75", #U u
  537. "\x56" => "\x76", #V v
  538. "\x57" => "\x77", #W w
  539. "\x58" => "\x78", #X x
  540. "\x59" => "\x79", #Y y
  541. "\x5a" => "\x7a", #Z z
  542. "\xc3\x80" => "\xc3\xa0",
  543. "\xc3\x81" => "\xc3\xa1",
  544. "\xc3\x82" => "\xc3\xa2",
  545. "\xc3\x83" => "\xc3\xa3",
  546. "\xc3\x84" => "\xc3\xa4",
  547. "\xc3\x85" => "\xc3\xa5",
  548. "\xc3\x86" => "\xc3\xa6",
  549. "\xc3\x87" => "\xc3\xa7",
  550. "\xc3\x88" => "\xc3\xa8",
  551. "\xc3\x89" => "\xc3\xa9",
  552. "\xc3\x8a" => "\xc3\xaa",
  553. "\xc3\x8b" => "\xc3\xab",
  554. "\xc3\x8c" => "\xc3\xac",
  555. "\xc3\x8d" => "\xc3\xad",
  556. "\xc3\x8e" => "\xc3\xae",
  557. "\xc3\x8f" => "\xc3\xaf",
  558. "\xc3\x90" => "\xc3\xb0",
  559. "\xc3\x91" => "\xc3\xb1",
  560. "\xc3\x92" => "\xc3\xb2",
  561. "\xc3\x93" => "\xc3\xb3",
  562. "\xc3\x94" => "\xc3\xb4",
  563. "\xc3\x95" => "\xc3\xb5",
  564. "\xc3\x96" => "\xc3\xb6",
  565. "\xc3\x98" => "\xc3\xb8",
  566. "\xc3\x99" => "\xc3\xb9",
  567. "\xc3\x9a" => "\xc3\xba",
  568. "\xc3\x9b" => "\xc3\xbb",
  569. "\xc3\x9c" => "\xc3\xbc",
  570. "\xc3\x9d" => "\xc3\xbd",
  571. "\xc3\x9e" => "\xc3\xbe",
  572. "\xc4\x80" => "\xc4\x81",
  573. "\xc4\x82" => "\xc4\x83",
  574. "\xc4\x84" => "\xc4\x85",
  575. "\xc4\x86" => "\xc4\x87",
  576. "\xc4\x88" => "\xc4\x89",
  577. "\xc4\x8a" => "\xc4\x8b",
  578. "\xc4\x8c" => "\xc4\x8d",
  579. "\xc4\x8e" => "\xc4\x8f",
  580. "\xc4\x90" => "\xc4\x91",
  581. "\xc4\x92" => "\xc4\x93",
  582. "\xc4\x94" => "\xc4\x95",
  583. "\xc4\x96" => "\xc4\x97",
  584. "\xc4\x98" => "\xc4\x99",
  585. "\xc4\x9a" => "\xc4\x9b",
  586. "\xc4\x9c" => "\xc4\x9d",
  587. "\xc4\x9e" => "\xc4\x9f",
  588. "\xc4\xa0" => "\xc4\xa1",
  589. "\xc4\xa2" => "\xc4\xa3",
  590. "\xc4\xa4" => "\xc4\xa5",
  591. "\xc4\xa6" => "\xc4\xa7",
  592. "\xc4\xa8" => "\xc4\xa9",
  593. "\xc4\xaa" => "\xc4\xab",
  594. "\xc4\xac" => "\xc4\xad",
  595. "\xc4\xae" => "\xc4\xaf",
  596. "\xc4\xb2" => "\xc4\xb3",
  597. "\xc4\xb4" => "\xc4\xb5",
  598. "\xc4\xb6" => "\xc4\xb7",
  599. "\xc4\xb9" => "\xc4\xba",
  600. "\xc4\xbb" => "\xc4\xbc",
  601. "\xc4\xbd" => "\xc4\xbe",
  602. "\xc4\xbf" => "\xc5\x80",
  603. "\xc5\x81" => "\xc5\x82",
  604. "\xc5\x83" => "\xc5\x84",
  605. "\xc5\x85" => "\xc5\x86",
  606. "\xc5\x87" => "\xc5\x88",
  607. "\xc5\x8a" => "\xc5\x8b",
  608. "\xc5\x8c" => "\xc5\x8d",
  609. "\xc5\x8e" => "\xc5\x8f",
  610. "\xc5\x90" => "\xc5\x91",
  611. "\xc5\x92" => "\xc5\x93",
  612. "\xc5\x94" => "\xc5\x95",
  613. "\xc5\x96" => "\xc5\x97",
  614. "\xc5\x98" => "\xc5\x99",
  615. "\xc5\x9a" => "\xc5\x9b",
  616. "\xc5\x9c" => "\xc5\x9d",
  617. "\xc5\x9e" => "\xc5\x9f",
  618. "\xc5\xa0" => "\xc5\xa1",
  619. "\xc5\xa2" => "\xc5\xa3",
  620. "\xc5\xa4" => "\xc5\xa5",
  621. "\xc5\xa6" => "\xc5\xa7",
  622. "\xc5\xa8" => "\xc5\xa9",
  623. "\xc5\xaa" => "\xc5\xab",
  624. "\xc5\xac" => "\xc5\xad",
  625. "\xc5\xae" => "\xc5\xaf",
  626. "\xc5\xb0" => "\xc5\xb1",
  627. "\xc5\xb2" => "\xc5\xb3",
  628. "\xc5\xb4" => "\xc5\xb5",
  629. "\xc5\xb6" => "\xc5\xb7",
  630. "\xc5\xb8" => "\xc3\xbf",
  631. "\xc5\xb9" => "\xc5\xba",
  632. "\xc5\xbb" => "\xc5\xbc",
  633. "\xc5\xbd" => "\xc5\xbe",
  634. "\xc6\x81" => "\xc9\x93",
  635. "\xc6\x82" => "\xc6\x83",
  636. "\xc6\x84" => "\xc6\x85",
  637. "\xc6\x86" => "\xc9\x94",
  638. "\xc6\x87" => "\xc6\x88",
  639. "\xc6\x89" => "\xc9\x96",
  640. "\xc6\x8a" => "\xc9\x97",
  641. "\xc6\x8b" => "\xc6\x8c",
  642. "\xc6\x8e" => "\xc7\x9d",
  643. "\xc6\x8f" => "\xc9\x99",
  644. "\xc6\x90" => "\xc9\x9b",
  645. "\xc6\x91" => "\xc6\x92",
  646. "\xc6\x94" => "\xc9\xa3",
  647. "\xc6\x96" => "\xc9\xa9",
  648. "\xc6\x97" => "\xc9\xa8",
  649. "\xc6\x98" => "\xc6\x99",
  650. "\xc6\x9c" => "\xc9\xaf",
  651. "\xc6\x9d" => "\xc9\xb2",
  652. "\xc6\x9f" => "\xc9\xb5",
  653. "\xc6\xa0" => "\xc6\xa1",
  654. "\xc6\xa2" => "\xc6\xa3",
  655. "\xc6\xa4" => "\xc6\xa5",
  656. "\xc6\xa6" => "\xca\x80",
  657. "\xc6\xa7" => "\xc6\xa8",
  658. "\xc6\xa9" => "\xca\x83",
  659. "\xc6\xac" => "\xc6\xad",
  660. "\xc6\xae" => "\xca\x88",
  661. "\xc6\xaf" => "\xc6\xb0",
  662. "\xc6\xb1" => "\xca\x8a",
  663. "\xc6\xb2" => "\xca\x8b",
  664. "\xc6\xb3" => "\xc6\xb4",
  665. "\xc6\xb5" => "\xc6\xb6",
  666. "\xc6\xb7" => "\xca\x92",
  667. "\xc6\xb8" => "\xc6\xb9",
  668. "\xc6\xbc" => "\xc6\xbd",
  669. "\xc7\x85" => "\xc7\x86",
  670. "\xc7\x88" => "\xc7\x89",
  671. "\xc7\x8b" => "\xc7\x8c",
  672. "\xc7\x8d" => "\xc7\x8e",
  673. "\xc7\x8f" => "\xc7\x90",
  674. "\xc7\x91" => "\xc7\x92",
  675. "\xc7\x93" => "\xc7\x94",
  676. "\xc7\x95" => "\xc7\x96",
  677. "\xc7\x97" => "\xc7\x98",
  678. "\xc7\x99" => "\xc7\x9a",
  679. "\xc7\x9b" => "\xc7\x9c",
  680. "\xc7\x9e" => "\xc7\x9f",
  681. "\xc7\xa0" => "\xc7\xa1",
  682. "\xc7\xa2" => "\xc7\xa3",
  683. "\xc7\xa4" => "\xc7\xa5",
  684. "\xc7\xa6" => "\xc7\xa7",
  685. "\xc7\xa8" => "\xc7\xa9",
  686. "\xc7\xaa" => "\xc7\xab",
  687. "\xc7\xac" => "\xc7\xad",
  688. "\xc7\xae" => "\xc7\xaf",
  689. "\xc7\xb2" => "\xc7\xb3",
  690. "\xc7\xb4" => "\xc7\xb5",
  691. "\xc7\xb6" => "\xc6\x95",
  692. "\xc7\xb7" => "\xc6\xbf",
  693. "\xc7\xb8" => "\xc7\xb9",
  694. "\xc7\xba" => "\xc7\xbb",
  695. "\xc7\xbc" => "\xc7\xbd",
  696. "\xc7\xbe" => "\xc7\xbf",
  697. "\xc8\x80" => "\xc8\x81",
  698. "\xc8\x82" => "\xc8\x83",
  699. "\xc8\x84" => "\xc8\x85",
  700. "\xc8\x86" => "\xc8\x87",
  701. "\xc8\x88" => "\xc8\x89",
  702. "\xc8\x8a" => "\xc8\x8b",
  703. "\xc8\x8c" => "\xc8\x8d",
  704. "\xc8\x8e" => "\xc8\x8f",
  705. "\xc8\x90" => "\xc8\x91",
  706. "\xc8\x92" => "\xc8\x93",
  707. "\xc8\x94" => "\xc8\x95",
  708. "\xc8\x96" => "\xc8\x97",
  709. "\xc8\x98" => "\xc8\x99",
  710. "\xc8\x9a" => "\xc8\x9b",
  711. "\xc8\x9c" => "\xc8\x9d",
  712. "\xc8\x9e" => "\xc8\x9f",
  713. "\xc8\xa0" => "\xc6\x9e",
  714. "\xc8\xa2" => "\xc8\xa3",
  715. "\xc8\xa4" => "\xc8\xa5",
  716. "\xc8\xa6" => "\xc8\xa7",
  717. "\xc8\xa8" => "\xc8\xa9",
  718. "\xc8\xaa" => "\xc8\xab",
  719. "\xc8\xac" => "\xc8\xad",
  720. "\xc8\xae" => "\xc8\xaf",
  721. "\xc8\xb0" => "\xc8\xb1",
  722. "\xc8\xb2" => "\xc8\xb3",
  723. "\xce\x86" => "\xce\xac",
  724. "\xce\x88" => "\xce\xad",
  725. "\xce\x89" => "\xce\xae",
  726. "\xce\x8a" => "\xce\xaf",
  727. "\xce\x8c" => "\xcf\x8c",
  728. "\xce\x8e" => "\xcf\x8d",
  729. "\xce\x8f" => "\xcf\x8e",
  730. "\xce\x91" => "\xce\xb1",
  731. "\xce\x92" => "\xce\xb2",
  732. "\xce\x93" => "\xce\xb3",
  733. "\xce\x94" => "\xce\xb4",
  734. "\xce\x95" => "\xce\xb5",
  735. "\xce\x96" => "\xce\xb6",
  736. "\xce\x97" => "\xce\xb7",
  737. "\xce\x98" => "\xce\xb8",
  738. "\xce\x99" => "\xce\xb9",
  739. "\xce\x9a" => "\xce\xba",
  740. "\xce\x9b" => "\xce\xbb",
  741. "\xce\x9c" => "\xc2\xb5",
  742. "\xce\x9d" => "\xce\xbd",
  743. "\xce\x9e" => "\xce\xbe",
  744. "\xce\x9f" => "\xce\xbf",
  745. "\xce\xa0" => "\xcf\x80",
  746. "\xce\xa1" => "\xcf\x81",
  747. "\xce\xa3" => "\xcf\x82",
  748. "\xce\xa4" => "\xcf\x84",
  749. "\xce\xa5" => "\xcf\x85",
  750. "\xce\xa6" => "\xcf\x86",
  751. "\xce\xa7" => "\xcf\x87",
  752. "\xce\xa8" => "\xcf\x88",
  753. "\xce\xa9" => "\xcf\x89",
  754. "\xce\xaa" => "\xcf\x8a",
  755. "\xce\xab" => "\xcf\x8b",
  756. "\xcf\x98" => "\xcf\x99",
  757. "\xcf\x9a" => "\xcf\x9b",
  758. "\xcf\x9c" => "\xcf\x9d",
  759. "\xcf\x9e" => "\xcf\x9f",
  760. "\xcf\xa0" => "\xcf\xa1",
  761. "\xcf\xa2" => "\xcf\xa3",
  762. "\xcf\xa4" => "\xcf\xa5",
  763. "\xcf\xa6" => "\xcf\xa7",
  764. "\xcf\xa8" => "\xcf\xa9",
  765. "\xcf\xaa" => "\xcf\xab",
  766. "\xcf\xac" => "\xcf\xad",
  767. "\xcf\xae" => "\xcf\xaf",
  768. "\xd0\x80" => "\xd1\x90",
  769. "\xd0\x81" => "\xd1\x91",
  770. "\xd0\x82" => "\xd1\x92",
  771. "\xd0\x83" => "\xd1\x93",
  772. "\xd0\x84" => "\xd1\x94",
  773. "\xd0\x85" => "\xd1\x95",
  774. "\xd0\x86" => "\xd1\x96",
  775. "\xd0\x87" => "\xd1\x97",
  776. "\xd0\x88" => "\xd1\x98",
  777. "\xd0\x89" => "\xd1\x99",
  778. "\xd0\x8a" => "\xd1\x9a",
  779. "\xd0\x8b" => "\xd1\x9b",
  780. "\xd0\x8c" => "\xd1\x9c",
  781. "\xd0\x8d" => "\xd1\x9d",
  782. "\xd0\x8e" => "\xd1\x9e",
  783. "\xd0\x8f" => "\xd1\x9f",
  784. "\xd0\x90" => "\xd0\xb0",
  785. "\xd0\x91" => "\xd0\xb1",
  786. "\xd0\x92" => "\xd0\xb2",
  787. "\xd0\x93" => "\xd0\xb3",
  788. "\xd0\x94" => "\xd0\xb4",
  789. "\xd0\x95" => "\xd0\xb5",
  790. "\xd0\x96" => "\xd0\xb6",
  791. "\xd0\x97" => "\xd0\xb7",
  792. "\xd0\x98" => "\xd0\xb8",
  793. "\xd0\x99" => "\xd0\xb9",
  794. "\xd0\x9a" => "\xd0\xba",
  795. "\xd0\x9b" => "\xd0\xbb",
  796. "\xd0\x9c" => "\xd0\xbc",
  797. "\xd0\x9d" => "\xd0\xbd",
  798. "\xd0\x9e" => "\xd0\xbe",
  799. "\xd0\x9f" => "\xd0\xbf",
  800. "\xd0\xa0" => "\xd1\x80",
  801. "\xd0\xa1" => "\xd1\x81",
  802. "\xd0\xa2" => "\xd1\x82",
  803. "\xd0\xa3" => "\xd1\x83",
  804. "\xd0\xa4" => "\xd1\x84",
  805. "\xd0\xa5" => "\xd1\x85",
  806. "\xd0\xa6" => "\xd1\x86",
  807. "\xd0\xa7" => "\xd1\x87",
  808. "\xd0\xa8" => "\xd1\x88",
  809. "\xd0\xa9" => "\xd1\x89",
  810. "\xd0\xaa" => "\xd1\x8a",
  811. "\xd0\xab" => "\xd1\x8b",
  812. "\xd0\xac" => "\xd1\x8c",
  813. "\xd0\xad" => "\xd1\x8d",
  814. "\xd0\xae" => "\xd1\x8e",
  815. "\xd0\xaf" => "\xd1\x8f",
  816. "\xd1\xa0" => "\xd1\xa1",
  817. "\xd1\xa2" => "\xd1\xa3",
  818. "\xd1\xa4" => "\xd1\xa5",
  819. "\xd1\xa6" => "\xd1\xa7",
  820. "\xd1\xa8" => "\xd1\xa9",
  821. "\xd1\xaa" => "\xd1\xab",
  822. "\xd1\xac" => "\xd1\xad",
  823. "\xd1\xae" => "\xd1\xaf",
  824. "\xd1\xb0" => "\xd1\xb1",
  825. "\xd1\xb2" => "\xd1\xb3",
  826. "\xd1\xb4" => "\xd1\xb5",
  827. "\xd1\xb6" => "\xd1\xb7",
  828. "\xd1\xb8" => "\xd1\xb9",
  829. "\xd1\xba" => "\xd1\xbb",
  830. "\xd1\xbc" => "\xd1\xbd",
  831. "\xd1\xbe" => "\xd1\xbf",
  832. "\xd2\x80" => "\xd2\x81",
  833. "\xd2\x8a" => "\xd2\x8b",
  834. "\xd2\x8c" => "\xd2\x8d",
  835. "\xd2\x8e" => "\xd2\x8f",
  836. "\xd2\x90" => "\xd2\x91",
  837. "\xd2\x92" => "\xd2\x93",
  838. "\xd2\x94" => "\xd2\x95",
  839. "\xd2\x96" => "\xd2\x97",
  840. "\xd2\x98" => "\xd2\x99",
  841. "\xd2\x9a" => "\xd2\x9b",
  842. "\xd2\x9c" => "\xd2\x9d",
  843. "\xd2\x9e" => "\xd2\x9f",
  844. "\xd2\xa0" => "\xd2\xa1",
  845. "\xd2\xa2" => "\xd2\xa3",
  846. "\xd2\xa4" => "\xd2\xa5",
  847. "\xd2\xa6" => "\xd2\xa7",
  848. "\xd2\xa8" => "\xd2\xa9",
  849. "\xd2\xaa" => "\xd2\xab",
  850. "\xd2\xac" => "\xd2\xad",
  851. "\xd2\xae" => "\xd2\xaf",
  852. "\xd2\xb0" => "\xd2\xb1",
  853. "\xd2\xb2" => "\xd2\xb3",
  854. "\xd2\xb4" => "\xd2\xb5",
  855. "\xd2\xb6" => "\xd2\xb7",
  856. "\xd2\xb8" => "\xd2\xb9",
  857. "\xd2\xba" => "\xd2\xbb",
  858. "\xd2\xbc" => "\xd2\xbd",
  859. "\xd2\xbe" => "\xd2\xbf",
  860. "\xd3\x81" => "\xd3\x82",
  861. "\xd3\x83" => "\xd3\x84",
  862. "\xd3\x85" => "\xd3\x86",
  863. "\xd3\x87" => "\xd3\x88",
  864. "\xd3\x89" => "\xd3\x8a",
  865. "\xd3\x8b" => "\xd3\x8c",
  866. "\xd3\x8d" => "\xd3\x8e",
  867. "\xd3\x90" => "\xd3\x91",
  868. "\xd3\x92" => "\xd3\x93",
  869. "\xd3\x94" => "\xd3\x95",
  870. "\xd3\x96" => "\xd3\x97",
  871. "\xd3\x98" => "\xd3\x99",
  872. "\xd3\x9a" => "\xd3\x9b",
  873. "\xd3\x9c" => "\xd3\x9d",
  874. "\xd3\x9e" => "\xd3\x9f",
  875. "\xd3\xa0" => "\xd3\xa1",
  876. "\xd3\xa2" => "\xd3\xa3",
  877. "\xd3\xa4" => "\xd3\xa5",
  878. "\xd3\xa6" => "\xd3\xa7",
  879. "\xd3\xa8" => "\xd3\xa9",
  880. "\xd3\xaa" => "\xd3\xab",
  881. "\xd3\xac" => "\xd3\xad",
  882. "\xd3\xae" => "\xd3\xaf",
  883. "\xd3\xb0" => "\xd3\xb1",
  884. "\xd3\xb2" => "\xd3\xb3",
  885. "\xd3\xb4" => "\xd3\xb5",
  886. "\xd3\xb8" => "\xd3\xb9",
  887. "\xd4\x80" => "\xd4\x81",
  888. "\xd4\x82" => "\xd4\x83",
  889. "\xd4\x84" => "\xd4\x85",
  890. "\xd4\x86" => "\xd4\x87",
  891. "\xd4\x88" => "\xd4\x89",
  892. "\xd4\x8a" => "\xd4\x8b",
  893. "\xd4\x8c" => "\xd4\x8d",
  894. "\xd4\x8e" => "\xd4\x8f",
  895. "\xd4\xb1" => "\xd5\xa1",
  896. "\xd4\xb2" => "\xd5\xa2",
  897. "\xd4\xb3" => "\xd5\xa3",
  898. "\xd4\xb4" => "\xd5\xa4",
  899. "\xd4\xb5" => "\xd5\xa5",
  900. "\xd4\xb6" => "\xd5\xa6",
  901. "\xd4\xb7" => "\xd5\xa7",
  902. "\xd4\xb8" => "\xd5\xa8",
  903. "\xd4\xb9" => "\xd5\xa9",
  904. "\xd4\xba" => "\xd5\xaa",
  905. "\xd4\xbb" => "\xd5\xab",
  906. "\xd4\xbc" => "\xd5\xac",
  907. "\xd4\xbd" => "\xd5\xad",
  908. "\xd4\xbe" => "\xd5\xae",
  909. "\xd4\xbf" => "\xd5\xaf",
  910. "\xd5\x80" => "\xd5\xb0",
  911. "\xd5\x81" => "\xd5\xb1",
  912. "\xd5\x82" => "\xd5\xb2",
  913. "\xd5\x83" => "\xd5\xb3",
  914. "\xd5\x84" => "\xd5\xb4",
  915. "\xd5\x85" => "\xd5\xb5",
  916. "\xd5\x86" => "\xd5\xb6",
  917. "\xd5\x87" => "\xd5\xb7",
  918. "\xd5\x88" => "\xd5\xb8",
  919. "\xd5\x89" => "\xd5\xb9",
  920. "\xd5\x8a" => "\xd5\xba",
  921. "\xd5\x8b" => "\xd5\xbb",
  922. "\xd5\x8c" => "\xd5\xbc",
  923. "\xd5\x8d" => "\xd5\xbd",
  924. "\xd5\x8e" => "\xd5\xbe",
  925. "\xd5\x8f" => "\xd5\xbf",
  926. "\xd5\x90" => "\xd6\x80",
  927. "\xd5\x91" => "\xd6\x81",
  928. "\xd5\x92" => "\xd6\x82",
  929. "\xd5\x93" => "\xd6\x83",
  930. "\xd5\x94" => "\xd6\x84",
  931. "\xd5\x95" => "\xd6\x85",
  932. "\xd5\x96" => "\xd6\x86",
  933. "\xe1\xb8\x80" => "\xe1\xb8\x81",
  934. "\xe1\xb8\x82" => "\xe1\xb8\x83",
  935. "\xe1\xb8\x84" => "\xe1\xb8\x85",
  936. "\xe1\xb8\x86" => "\xe1\xb8\x87",
  937. "\xe1\xb8\x88" => "\xe1\xb8\x89",
  938. "\xe1\xb8\x8a" => "\xe1\xb8\x8b",
  939. "\xe1\xb8\x8c" => "\xe1\xb8\x8d",
  940. "\xe1\xb8\x8e" => "\xe1\xb8\x8f",
  941. "\xe1\xb8\x90" => "\xe1\xb8\x91",
  942. "\xe1\xb8\x92" => "\xe1\xb8\x93",
  943. "\xe1\xb8\x94" => "\xe1\xb8\x95",
  944. "\xe1\xb8\x96" => "\xe1\xb8\x97",
  945. "\xe1\xb8\x98" => "\xe1\xb8\x99",
  946. "\xe1\xb8\x9a" => "\xe1\xb8\x9b",
  947. "\xe1\xb8\x9c" => "\xe1\xb8\x9d",
  948. "\xe1\xb8\x9e" => "\xe1\xb8\x9f",
  949. "\xe1\xb8\xa0" => "\xe1\xb8\xa1",
  950. "\xe1\xb8\xa2" => "\xe1\xb8\xa3",
  951. "\xe1\xb8\xa4" => "\xe1\xb8\xa5",
  952. "\xe1\xb8\xa6" => "\xe1\xb8\xa7",
  953. "\xe1\xb8\xa8" => "\xe1\xb8\xa9",
  954. "\xe1\xb8\xaa" => "\xe1\xb8\xab",
  955. "\xe1\xb8\xac" => "\xe1\xb8\xad",
  956. "\xe1\xb8\xae" => "\xe1\xb8\xaf",
  957. "\xe1\xb8\xb0" => "\xe1\xb8\xb1",
  958. "\xe1\xb8\xb2" => "\xe1\xb8\xb3",
  959. "\xe1\xb8\xb4" => "\xe1\xb8\xb5",
  960. "\xe1\xb8\xb6" => "\xe1\xb8\xb7",
  961. "\xe1\xb8\xb8" => "\xe1\xb8\xb9",
  962. "\xe1\xb8\xba" => "\xe1\xb8\xbb",
  963. "\xe1\xb8\xbc" => "\xe1\xb8\xbd",
  964. "\xe1\xb8\xbe" => "\xe1\xb8\xbf",
  965. "\xe1\xb9\x80" => "\xe1\xb9\x81",
  966. "\xe1\xb9\x82" => "\xe1\xb9\x83",
  967. "\xe1\xb9\x84" => "\xe1\xb9\x85",
  968. "\xe1\xb9\x86" => "\xe1\xb9\x87",
  969. "\xe1\xb9\x88" => "\xe1\xb9\x89",
  970. "\xe1\xb9\x8a" => "\xe1\xb9\x8b",
  971. "\xe1\xb9\x8c" => "\xe1\xb9\x8d",
  972. "\xe1\xb9\x8e" => "\xe1\xb9\x8f",
  973. "\xe1\xb9\x90" => "\xe1\xb9\x91",
  974. "\xe1\xb9\x92" => "\xe1\xb9\x93",
  975. "\xe1\xb9\x94" => "\xe1\xb9\x95",
  976. "\xe1\xb9\x96" => "\xe1\xb9\x97",
  977. "\xe1\xb9\x98" => "\xe1\xb9\x99",
  978. "\xe1\xb9\x9a" => "\xe1\xb9\x9b",
  979. "\xe1\xb9\x9c" => "\xe1\xb9\x9d",
  980. "\xe1\xb9\x9e" => "\xe1\xb9\x9f",
  981. "\xe1\xb9\xa0" => "\xe1\xb9\xa1",
  982. "\xe1\xb9\xa2" => "\xe1\xb9\xa3",
  983. "\xe1\xb9\xa4" => "\xe1\xb9\xa5",
  984. "\xe1\xb9\xa6" => "\xe1\xb9\xa7",
  985. "\xe1\xb9\xa8" => "\xe1\xb9\xa9",
  986. "\xe1\xb9\xaa" => "\xe1\xb9\xab",
  987. "\xe1\xb9\xac" => "\xe1\xb9\xad",
  988. "\xe1\xb9\xae" => "\xe1\xb9\xaf",
  989. "\xe1\xb9\xb0" => "\xe1\xb9\xb1",
  990. "\xe1\xb9\xb2" => "\xe1\xb9\xb3",
  991. "\xe1\xb9\xb4" => "\xe1\xb9\xb5",
  992. "\xe1\xb9\xb6" => "\xe1\xb9\xb7",
  993. "\xe1\xb9\xb8" => "\xe1\xb9\xb9",
  994. "\xe1\xb9\xba" => "\xe1\xb9\xbb",
  995. "\xe1\xb9\xbc" => "\xe1\xb9\xbd",
  996. "\xe1\xb9\xbe" => "\xe1\xb9\xbf",
  997. "\xe1\xba\x80" => "\xe1\xba\x81",
  998. "\xe1\xba\x82" => "\xe1\xba\x83",
  999. "\xe1\xba\x84" => "\xe1\xba\x85",
  1000. "\xe1\xba\x86" => "\xe1\xba\x87",
  1001. "\xe1\xba\x88" => "\xe1\xba\x89",
  1002. "\xe1\xba\x8a" => "\xe1\xba\x8b",
  1003. "\xe1\xba\x8c" => "\xe1\xba\x8d",
  1004. "\xe1\xba\x8e" => "\xe1\xba\x8f",
  1005. "\xe1\xba\x90" => "\xe1\xba\x91",
  1006. "\xe1\xba\x92" => "\xe1\xba\x93",
  1007. "\xe1\xba\x94" => "\xe1\xba\x95",
  1008. "\xe1\xba\xa0" => "\xe1\xba\xa1",
  1009. "\xe1\xba\xa2" => "\xe1\xba\xa3",
  1010. "\xe1\xba\xa4" => "\xe1\xba\xa5",
  1011. "\xe1\xba\xa6" => "\xe1\xba\xa7",
  1012. "\xe1\xba\xa8" => "\xe1\xba\xa9",
  1013. "\xe1\xba\xaa" => "\xe1\xba\xab",
  1014. "\xe1\xba\xac" => "\xe1\xba\xad",
  1015. "\xe1\xba\xae" => "\xe1\xba\xaf",
  1016. "\xe1\xba\xb0" => "\xe1\xba\xb1",
  1017. "\xe1\xba\xb2" => "\xe1\xba\xb3",
  1018. "\xe1\xba\xb4" => "\xe1\xba\xb5",
  1019. "\xe1\xba\xb6" => "\xe1\xba\xb7",
  1020. "\xe1\xba\xb8" => "\xe1\xba\xb9",
  1021. "\xe1\xba\xba" => "\xe1\xba\xbb",
  1022. "\xe1\xba\xbc" => "\xe1\xba\xbd",
  1023. "\xe1\xba\xbe" => "\xe1\xba\xbf",
  1024. "\xe1\xbb\x80" => "\xe1\xbb\x81",
  1025. "\xe1\xbb\x82" => "\xe1\xbb\x83",
  1026. "\xe1\xbb\x84" => "\xe1\xbb\x85",
  1027. "\xe1\xbb\x86" => "\xe1\xbb\x87",
  1028. "\xe1\xbb\x88" => "\xe1\xbb\x89",
  1029. "\xe1\xbb\x8a" => "\xe1\xbb\x8b",
  1030. "\xe1\xbb\x8c" => "\xe1\xbb\x8d",
  1031. "\xe1\xbb\x8e" => "\xe1\xbb\x8f",
  1032. "\xe1\xbb\x90" => "\xe1\xbb\x91",
  1033. "\xe1\xbb\x92" => "\xe1\xbb\x93",
  1034. "\xe1\xbb\x94" => "\xe1\xbb\x95",
  1035. "\xe1\xbb\x96" => "\xe1\xbb\x97",
  1036. "\xe1\xbb\x98" => "\xe1\xbb\x99",
  1037. "\xe1\xbb\x9a" => "\xe1\xbb\x9b",
  1038. "\xe1\xbb\x9c" => "\xe1\xbb\x9d",
  1039. "\xe1\xbb\x9e" => "\xe1\xbb\x9f",
  1040. "\xe1\xbb\xa0" => "\xe1\xbb\xa1",
  1041. "\xe1\xbb\xa2" => "\xe1\xbb\xa3",
  1042. "\xe1\xbb\xa4" => "\xe1\xbb\xa5",
  1043. "\xe1\xbb\xa6" => "\xe1\xbb\xa7",
  1044. "\xe1\xbb\xa8" => "\xe1\xbb\xa9",
  1045. "\xe1\xbb\xaa" => "\xe1\xbb\xab",
  1046. "\xe1\xbb\xac" => "\xe1\xbb\xad",
  1047. "\xe1\xbb\xae" => "\xe1\xbb\xaf",
  1048. "\xe1\xbb\xb0" => "\xe1\xbb\xb1",
  1049. "\xe1\xbb\xb2" => "\xe1\xbb\xb3",
  1050. "\xe1\xbb\xb4" => "\xe1\xbb\xb5",
  1051. "\xe1\xbb\xb6" => "\xe1\xbb\xb7",
  1052. "\xe1\xbb\xb8" => "\xe1\xbb\xb9",
  1053. "\xe1\xbc\x88" => "\xe1\xbc\x80",
  1054. "\xe1\xbc\x89" => "\xe1\xbc\x81",
  1055. "\xe1\xbc\x8a" => "\xe1\xbc\x82",
  1056. "\xe1\xbc\x8b" => "\xe1\xbc\x83",
  1057. "\xe1\xbc\x8c" => "\xe1\xbc\x84",
  1058. "\xe1\xbc\x8d" => "\xe1\xbc\x85",
  1059. "\xe1\xbc\x8e" => "\xe1\xbc\x86",
  1060. "\xe1\xbc\x8f" => "\xe1\xbc\x87",
  1061. "\xe1\xbc\x98" => "\xe1\xbc\x90",
  1062. "\xe1\xbc\x99" => "\xe1\xbc\x91",
  1063. "\xe1\xbc\x9a" => "\xe1\xbc\x92",
  1064. "\xe1\xbc\x9b" => "\xe1\xbc\x93",
  1065. "\xe1\xbc\x9c" => "\xe1\xbc\x94",
  1066. "\xe1\xbc\x9d" => "\xe1\xbc\x95",
  1067. "\xe1\xbc\xa9" => "\xe1\xbc\xa1",
  1068. "\xe1\xbc\xaa" => "\xe1\xbc\xa2",
  1069. "\xe1\xbc\xab" => "\xe1\xbc\xa3",
  1070. "\xe1\xbc\xac" => "\xe1\xbc\xa4",
  1071. "\xe1\xbc\xad" => "\xe1\xbc\xa5",
  1072. "\xe1\xbc\xae" => "\xe1\xbc\xa6",
  1073. "\xe1\xbc\xaf" => "\xe1\xbc\xa7",
  1074. "\xe1\xbc\xb8" => "\xe1\xbc\xb0",
  1075. "\xe1\xbc\xb9" => "\xe1\xbc\xb1",
  1076. "\xe1\xbc\xba" => "\xe1\xbc\xb2",
  1077. "\xe1\xbc\xbb" => "\xe1\xbc\xb3",
  1078. "\xe1\xbc\xbc" => "\xe1\xbc\xb4",
  1079. "\xe1\xbc\xbd" => "\xe1\xbc\xb5",
  1080. "\xe1\xbc\xbe" => "\xe1\xbc\xb6",
  1081. "\xe1\xbc\xbf" => "\xe1\xbc\xb7",
  1082. "\xe1\xbd\x88" => "\xe1\xbd\x80",
  1083. "\xe1\xbd\x89" => "\xe1\xbd\x81",
  1084. "\xe1\xbd\x8a" => "\xe1\xbd\x82",
  1085. "\xe1\xbd\x8b" => "\xe1\xbd\x83",
  1086. "\xe1\xbd\x8c" => "\xe1\xbd\x84",
  1087. "\xe1\xbd\x8d" => "\xe1\xbd\x85",
  1088. "\xe1\xbd\x99" => "\xe1\xbd\x91",
  1089. "\xe1\xbd\x9b" => "\xe1\xbd\x93",
  1090. "\xe1\xbd\x9d" => "\xe1\xbd\x95",
  1091. "\xe1\xbd\x9f" => "\xe1\xbd\x97",
  1092. "\xe1\xbd\xa9" => "\xe1\xbd\xa1",
  1093. "\xe1\xbd\xaa" => "\xe1\xbd\xa2",
  1094. "\xe1\xbd\xab" => "\xe1\xbd\xa3",
  1095. "\xe1\xbd\xac" => "\xe1\xbd\xa4",
  1096. "\xe1\xbd\xad" => "\xe1\xbd\xa5",
  1097. "\xe1\xbd\xae" => "\xe1\xbd\xa6",
  1098. "\xe1\xbd\xaf" => "\xe1\xbd\xa7",
  1099. "\xe1\xbe\x88" => "\xe1\xbe\x80",
  1100. "\xe1\xbe\x89" => "\xe1\xbe\x81",
  1101. "\xe1\xbe\x8a" => "\xe1\xbe\x82",
  1102. "\xe1\xbe\x8b" => "\xe1\xbe\x83",
  1103. "\xe1\xbe\x8c" => "\xe1\xbe\x84",
  1104. "\xe1\xbe\x8d" => "\xe1\xbe\x85",
  1105. "\xe1\xbe\x8e" => "\xe1\xbe\x86",
  1106. "\xe1\xbe\x8f" => "\xe1\xbe\x87",
  1107. "\xe1\xbe\x98" => "\xe1\xbe\x90",
  1108. "\xe1\xbe\x99" => "\xe1\xbe\x91",
  1109. "\xe1\xbe\x9a" => "\xe1\xbe\x92",
  1110. "\xe1\xbe\x9b" => "\xe1\xbe\x93",
  1111. "\xe1\xbe\x9c" => "\xe1\xbe\x94",
  1112. "\xe1\xbe\x9d" => "\xe1\xbe\x95",
  1113. "\xe1\xbe\x9e" => "\xe1\xbe\x96",
  1114. "\xe1\xbe\x9f" => "\xe1\xbe\x97",
  1115. "\xe1\xbe\xa9" => "\xe1\xbe\xa1",
  1116. "\xe1\xbe\xaa" => "\xe1\xbe\xa2",
  1117. "\xe1\xbe\xab" => "\xe1\xbe\xa3",
  1118. "\xe1\xbe\xac" => "\xe1\xbe\xa4",
  1119. "\xe1\xbe\xad" => "\xe1\xbe\xa5",
  1120. "\xe1\xbe\xae" => "\xe1\xbe\xa6",
  1121. "\xe1\xbe\xaf" => "\xe1\xbe\xa7",
  1122. "\xe1\xbe\xb8" => "\xe1\xbe\xb0",
  1123. "\xe1\xbe\xb9" => "\xe1\xbe\xb1",
  1124. "\xe1\xbe\xba" => "\xe1\xbd\xb0",
  1125. "\xe1\xbe\xbb" => "\xe1\xbd\xb1",
  1126. "\xe1\xbe\xbc" => "\xe1\xbe\xb3",
  1127. "\xe1\xbf\x88" => "\xe1\xbd\xb2",
  1128. "\xe1\xbf\x89" => "\xe1\xbd\xb3",
  1129. "\xe1\xbf\x8a" => "\xe1\xbd\xb4",
  1130. "\xe1\xbf\x8b" => "\xe1\xbd\xb5",
  1131. "\xe1\xbf\x8c" => "\xe1\xbf\x83",
  1132. "\xe1\xbf\x98" => "\xe1\xbf\x90",
  1133. "\xe1\xbf\x99" => "\xe1\xbf\x91",
  1134. "\xe1\xbf\x9a" => "\xe1\xbd\xb6",
  1135. "\xe1\xbf\x9b" => "\xe1\xbd\xb7",
  1136. "\xe1\xbf\xa9" => "\xe1\xbf\xa1",
  1137. "\xe1\xbf\xaa" => "\xe1\xbd\xba",
  1138. "\xe1\xbf\xab" => "\xe1\xbd\xbb",
  1139. "\xe1\xbf\xac" => "\xe1\xbf\xa5",
  1140. "\xe1\xbf\xb8" => "\xe1\xbd\xb8",
  1141. "\xe1\xbf\xb9" => "\xe1\xbd\xb9",
  1142. "\xe1\xbf\xba" => "\xe1\xbd\xbc",
  1143. "\xe1\xbf\xbb" => "\xe1\xbd\xbd",
  1144. "\xe1\xbf\xbc" => "\xe1\xbf\xb3",
  1145. "\xef\xbc\xa1" => "\xef\xbd\x81",
  1146. "\xef\xbc\xa2" => "\xef\xbd\x82",
  1147. "\xef\xbc\xa3" => "\xef\xbd\x83",
  1148. "\xef\xbc\xa4" => "\xef\xbd\x84",
  1149. "\xef\xbc\xa5" => "\xef\xbd\x85",
  1150. "\xef\xbc\xa6" => "\xef\xbd\x86",
  1151. "\xef\xbc\xa7" => "\xef\xbd\x87",
  1152. "\xef\xbc\xa8" => "\xef\xbd\x88",
  1153. "\xef\xbc\xa9" => "\xef\xbd\x89",
  1154. "\xef\xbc\xaa" => "\xef\xbd\x8a",
  1155. "\xef\xbc\xab" => "\xef\xbd\x8b",
  1156. "\xef\xbc\xac" => "\xef\xbd\x8c",
  1157. "\xef\xbc\xad" => "\xef\xbd\x8d",
  1158. "\xef\xbc\xae" => "\xef\xbd\x8e",
  1159. "\xef\xbc\xaf" => "\xef\xbd\x8f",
  1160. "\xef\xbc\xb0" => "\xef\xbd\x90",
  1161. "\xef\xbc\xb1" => "\xef\xbd\x91",
  1162. "\xef\xbc\xb2" => "\xef\xbd\x92",
  1163. "\xef\xbc\xb3" => "\xef\xbd\x93",
  1164. "\xef\xbc\xb4" => "\xef\xbd\x94",
  1165. "\xef\xbc\xb5" => "\xef\xbd\x95",
  1166. "\xef\xbc\xb6" => "\xef\xbd\x96",
  1167. "\xef\xbc\xb7" => "\xef\xbd\x97",
  1168. "\xef\xbc\xb8" => "\xef\xbd\x98",
  1169. "\xef\xbc\xb9" => "\xef\xbd\x99",
  1170. "\xef\xbc\xba" => "\xef\xbd\x9a",
  1171. );
  1172. #Unicode Character Database 6.0.0 (2010-06-04)
  1173. #autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total
  1174. public static $unicode_blocks = array(
  1175. 'Basic Latin' => array(
  1176. 0 => 0x0000,
  1177. 1 => 0x007F,
  1178. 2 => 0,
  1179. ),
  1180. 'Latin-1 Supplement' => array(
  1181. 0 => 0x0080,
  1182. 1 => 0x00FF,
  1183. 2 => 1,
  1184. ),
  1185. 'Latin Extended-A' => array(
  1186. 0 => 0x0100,
  1187. 1 => 0x017F,
  1188. 2 => 2,
  1189. ),
  1190. 'Latin Extended-B' => array(
  1191. 0 => 0x0180,
  1192. 1 => 0x024F,
  1193. 2 => 3,
  1194. ),
  1195. 'IPA Extensions' => array(
  1196. 0 => 0x0250,
  1197. 1 => 0x02AF,
  1198. 2 => 4,
  1199. ),
  1200. 'Spacing Modifier Letters' => array(
  1201. 0 => 0x02B0,
  1202. 1 => 0x02FF,
  1203. 2 => 5,
  1204. ),
  1205. 'Combining Diacritical Marks' => array(
  1206. 0 => 0x0300,
  1207. 1 => 0x036F,
  1208. 2 => 6,
  1209. ),
  1210. 'Greek and Coptic' => array(
  1211. 0 => 0x0370,
  1212. 1 => 0x03FF,
  1213. 2 => 7,
  1214. ),
  1215. 'Cyrillic' => array(
  1216. 0 => 0x0400,
  1217. 1 => 0x04FF,
  1218. 2 => 8,
  1219. ),
  1220. 'Cyrillic Supplement' => array(
  1221. 0 => 0x0500,
  1222. 1 => 0x052F,
  1223. 2 => 9,
  1224. ),
  1225. 'Armenian' => array(
  1226. 0 => 0x0530,
  1227. 1 => 0x058F,
  1228. 2 => 10,
  1229. ),
  1230. 'Hebrew' => array(
  1231. 0 => 0x0590,
  1232. 1 => 0x05FF,
  1233. 2 => 11,
  1234. ),
  1235. 'Arabic' => array(
  1236. 0 => 0x0600,
  1237. 1 => 0x06FF,
  1238. 2 => 12,
  1239. ),
  1240. 'Syriac' => array(
  1241. 0 => 0x0700,
  1242. 1 => 0x074F,
  1243. 2 => 13,
  1244. ),
  1245. 'Arabic Supplement' => array(
  1246. 0 => 0x0750,
  1247. 1 => 0x077F,
  1248. 2 => 14,
  1249. ),
  1250. 'Thaana' => array(
  1251. 0 => 0x0780,
  1252. 1 => 0x07BF,
  1253. 2 => 15,
  1254. ),
  1255. 'NKo' => array(
  1256. 0 => 0x07C0,
  1257. 1 => 0x07FF,
  1258. 2 => 16,
  1259. ),
  1260. 'Samaritan' => array(
  1261. 0 => 0x0800,
  1262. 1 => 0x083F,
  1263. 2 => 17,
  1264. ),
  1265. 'Mandaic' => array(
  1266. 0 => 0x0840,
  1267. 1 => 0x085F,
  1268. 2 => 18,
  1269. ),
  1270. 'Devanagari' => array(
  1271. 0 => 0x0900,
  1272. 1 => 0x097F,
  1273. 2 => 19,
  1274. ),
  1275. 'Bengali' => array(
  1276. 0 => 0x0980,
  1277. 1 => 0x09FF,
  1278. 2 => 20,
  1279. ),
  1280. 'Gurmukhi' => array(
  1281. 0 => 0x0A00,
  1282. 1 => 0x0A7F,
  1283. 2 => 21,
  1284. ),
  1285. 'Gujarati' => array(
  1286. 0 => 0x0A80,
  1287. 1 => 0x0AFF,
  1288. 2 => 22,
  1289. ),
  1290. 'Oriya' => array(
  1291. 0 => 0x0B00,
  1292. 1 => 0x0B7F,
  1293. 2 => 23,
  1294. ),
  1295. 'Tamil' => array(
  1296. 0 => 0x0B80,
  1297. 1 => 0x0BFF,
  1298. 2 => 24,
  1299. ),
  1300. 'Telugu' => array(
  1301. 0 => 0x0C00,
  1302. 1 => 0x0C7F,
  1303. 2 => 25,
  1304. ),
  1305. 'Kannada' => array(
  1306. 0 => 0x0C80,
  1307. 1 => 0x0CFF,
  1308. 2 => 26,
  1309. ),
  1310. 'Malayalam' => array(
  1311. 0 => 0x0D00,
  1312. 1 => 0x0D7F,
  1313. 2 => 27,
  1314. ),
  1315. 'Sinhala' => array(
  1316. 0 => 0x0D80,
  1317. 1 => 0x0DFF,
  1318. 2 => 28,
  1319. ),
  1320. 'Thai' => array(
  1321. 0 => 0x0E00,
  1322. 1 => 0x0E7F,
  1323. 2 => 29,
  1324. ),
  1325. 'Lao' => array(
  1326. 0 => 0x0E80,
  1327. 1 => 0x0EFF,
  1328. 2 => 30,
  1329. ),
  1330. 'Tibetan' => array(
  1331. 0 => 0x0F00,
  1332. 1 => 0x0FFF,
  1333. 2 => 31,
  1334. ),
  1335. 'Myanmar' => array(
  1336. 0 => 0x1000,
  1337. 1 => 0x109F,
  1338. 2 => 32,
  1339. ),
  1340. 'Georgian' => array(
  1341. 0 => 0x10A0,
  1342. 1 => 0x10FF,
  1343. 2 => 33,
  1344. ),
  1345. 'Hangul Jamo' => array(
  1346. 0 => 0x1100,
  1347. 1 => 0x11FF,
  1348. 2 => 34,
  1349. ),
  1350. 'Ethiopic' => array(
  1351. 0 => 0x1200,
  1352. 1 => 0x137F,
  1353. 2 => 35,
  1354. ),
  1355. 'Ethiopic Supplement' => array(
  1356. 0 => 0x1380,
  1357. 1 => 0x139F,
  1358. 2 => 36,
  1359. ),
  1360. 'Cherokee' => array(
  1361. 0 => 0x13A0,
  1362. 1 => 0x13FF,
  1363. 2 => 37,
  1364. ),
  1365. 'Unified Canadian Aboriginal Syllabics' => array(
  1366. 0 => 0x1400,
  1367. 1 => 0x167F,
  1368. 2 => 38,
  1369. ),
  1370. 'Ogham' => array(
  1371. 0 => 0x1680,
  1372. 1 => 0x169F,
  1373. 2 => 39,
  1374. ),
  1375. 'Runic' => array(
  1376. 0 => 0x16A0,
  1377. 1 => 0x16FF,
  1378. 2 => 40,
  1379. ),
  1380. 'Tagalog' => array(
  1381. 0 => 0x1700,
  1382. 1 => 0x171F,
  1383. 2 => 41,
  1384. ),
  1385. 'Hanunoo' => array(
  1386. 0 => 0x1720,
  1387. 1 => 0x173F,
  1388. 2 => 42,
  1389. ),
  1390. 'Buhid' => array(
  1391. 0 => 0x1740,
  1392. 1 => 0x175F,
  1393. 2 => 43,
  1394. ),
  1395. 'Tagbanwa' => array(
  1396. 0 => 0x1760,
  1397. 1 => 0x177F,
  1398. 2 => 44,
  1399. ),
  1400. 'Khmer' => array(
  1401. 0 => 0x1780,
  1402. 1 => 0x17FF,
  1403. 2 => 45,
  1404. ),
  1405. 'Mongolian' => array(
  1406. 0 => 0x1800,
  1407. 1 => 0x18AF,
  1408. 2 => 46,
  1409. ),
  1410. 'Unified Canadian Aboriginal Syllabics Extended' => array(
  1411. 0 => 0x18B0,
  1412. 1 => 0x18FF,
  1413. 2 => 47,
  1414. ),
  1415. 'Limbu' => array(
  1416. 0 => 0x1900,
  1417. 1 => 0x194F,
  1418. 2 => 48,
  1419. ),
  1420. 'Tai Le' => array(
  1421. 0 => 0x1950,
  1422. 1 => 0x197F,
  1423. 2 => 49,
  1424. ),
  1425. 'New Tai Lue' => array(
  1426. 0 => 0x1980,
  1427. 1 => 0x19DF,
  1428. 2 => 50,
  1429. ),
  1430. 'Khmer Symbols' => array(
  1431. 0 => 0x19E0,
  1432. 1 => 0x19FF,
  1433. 2 => 51,
  1434. ),
  1435. 'Buginese' => array(
  1436. 0 => 0x1A00,
  1437. 1 => 0x1A1F,
  1438. 2 => 52,
  1439. ),
  1440. 'Tai Tham' => array(
  1441. 0 => 0x1A20,
  1442. 1 => 0x1AAF,
  1443. 2 => 53,
  1444. ),
  1445. 'Balinese' => array(
  1446. 0 => 0x1B00,
  1447. 1 => 0x1B7F,
  1448. 2 => 54,
  1449. ),
  1450. 'Sundanese' => array(
  1451. 0 => 0x1B80,
  1452. 1 => 0x1BBF,
  1453. 2 => 55,
  1454. ),
  1455. 'Batak' => array(
  1456. 0 => 0x1BC0,
  1457. 1 => 0x1BFF,
  1458. 2 => 56,
  1459. ),
  1460. 'Lepcha' => array(
  1461. 0 => 0x1C00,
  1462. 1 => 0x1C4F,
  1463. 2 => 57,
  1464. ),
  1465. 'Ol Chiki' => array(
  1466. 0 => 0x1C50,
  1467. 1 => 0x1C7F,
  1468. 2 => 58,
  1469. ),
  1470. 'Vedic Extensions' => array(
  1471. 0 => 0x1CD0,
  1472. 1 => 0x1CFF,
  1473. 2 => 59,
  1474. ),
  1475. 'Phonetic Extensions' => array(
  1476. 0 => 0x1D00,
  1477. 1 => 0x1D7F,
  1478. 2 => 60,
  1479. ),
  1480. 'Phonetic Extensions Supplement' => array(
  1481. 0 => 0x1D80,
  1482. 1 => 0x1DBF,
  1483. 2 => 61,
  1484. ),
  1485. 'Combining Diacritical Marks Supplement' => array(
  1486. 0 => 0x1DC0,
  1487. 1 => 0x1DFF,
  1488. 2 => 62,
  1489. ),
  1490. 'Latin Extended Additional' => array(
  1491. 0 => 0x1E00,
  1492. 1 => 0x1EFF,
  1493. 2 => 63,
  1494. ),
  1495. 'Greek Extended' => array(
  1496. 0 => 0x1F00,
  1497. 1 => 0x1FFF,
  1498. 2 => 64,
  1499. ),
  1500. 'General Punctuation' => array(
  1501. 0 => 0x2000,
  1502. 1 => 0x206F,
  1503. 2 => 65,
  1504. ),
  1505. 'Superscripts and Subscripts' => array(
  1506. 0 => 0x2070,
  1507. 1 => 0x209F,
  1508. 2 => 66,
  1509. ),
  1510. 'Currency Symbols' => array(
  1511. 0 => 0x20A0,
  1512. 1 => 0x20CF,
  1513. 2 => 67,
  1514. ),
  1515. 'Combining Diacritical Marks for Symbols' => array(
  1516. 0 => 0x20D0,
  1517. 1 => 0x20FF,
  1518. 2 => 68,
  1519. ),
  1520. 'Letterlike Symbols' => array(
  1521. 0 => 0x2100,
  1522. 1 => 0x214F,
  1523. 2 => 69,
  1524. ),
  1525. 'Number Forms' => array(
  1526. 0 => 0x2150,
  1527. 1 => 0x218F,
  1528. 2 => 70,
  1529. ),
  1530. 'Arrows' => array(
  1531. 0 => 0x2190,
  1532. 1 => 0x21FF,
  1533. 2 => 71,
  1534. ),
  1535. 'Mathematical Operators' => array(
  1536. 0 => 0x2200,
  1537. 1 => 0x22FF,
  1538. 2 => 72,
  1539. ),
  1540. 'Miscellaneous Technical' => array(
  1541. 0 => 0x2300,
  1542. 1 => 0x23FF,
  1543. 2 => 73,
  1544. ),
  1545. 'Control Pictures' => array(
  1546. 0 => 0x2400,
  1547. 1 => 0x243F,
  1548. 2 => 74,
  1549. ),
  1550. 'Optical Character Recognition' => array(
  1551. 0 => 0x2440,
  1552. 1 => 0x245F,
  1553. 2 => 75,
  1554. ),
  1555. 'Enclosed Alphanumerics' => array(
  1556. 0 => 0x2460,
  1557. 1 => 0x24FF,
  1558. 2 => 76,
  1559. ),
  1560. 'Box Drawing' => array(
  1561. 0 => 0x2500,
  1562. 1 => 0x257F,
  1563. 2 => 77,
  1564. ),
  1565. 'Block Elements' => array(
  1566. 0 => 0x2580,
  1567. 1 => 0x259F,
  1568. 2 => 78,
  1569. ),
  1570. 'Geometric Shapes' => array(
  1571. 0 => 0x25A0,
  1572. 1 => 0x25FF,
  1573. 2 => 79,
  1574. ),
  1575. 'Miscellaneous Symbols' => array(
  1576. 0 => 0x2600,
  1577. 1 => 0x26FF,
  1578. 2 => 80,
  1579. ),
  1580. 'Dingbats' => array(
  1581. 0 => 0x2700,
  1582. 1 => 0x27BF,
  1583. 2 => 81,
  1584. ),
  1585. 'Miscellaneous Mathematical Symbols-A' => array(
  1586. 0 => 0x27C0,
  1587. 1 => 0x27EF,
  1588. 2 => 82,
  1589. ),
  1590. 'Supplemental Arrows-A' => array(
  1591. 0 => 0x27F0,
  1592. 1 => 0x27FF,
  1593. 2 => 83,
  1594. ),
  1595. 'Braille Patterns' => array(
  1596. 0 => 0x2800,
  1597. 1 => 0x28FF,
  1598. 2 => 84,
  1599. ),
  1600. 'Supplemental Arrows-B' => array(
  1601. 0 => 0x2900,
  1602. 1 => 0x297F,
  1603. 2 => 85,
  1604. ),
  1605. 'Miscellaneous Mathematical Symbols-B' => array(
  1606. 0 => 0x2980,
  1607. 1 => 0x29FF,
  1608. 2 => 86,
  1609. ),
  1610. 'Supplemental Mathematical Operators' => array(
  1611. 0 => 0x2A00,
  1612. 1 => 0x2AFF,
  1613. 2 => 87,
  1614. ),
  1615. 'Miscellaneous Symbols and Arrows' => array(
  1616. 0 => 0x2B00,
  1617. 1 => 0x2BFF,
  1618. 2 => 88,
  1619. ),
  1620. 'Glagolitic' => array(
  1621. 0 => 0x2C00,
  1622. 1 => 0x2C5F,
  1623. 2 => 89,
  1624. ),
  1625. 'Latin Extended-C' => array(
  1626. 0 => 0x2C60,
  1627. 1 => 0x2C7F,
  1628. 2 => 90,
  1629. ),
  1630. 'Coptic' => array(
  1631. 0 => 0x2C80,
  1632. 1 => 0x2CFF,
  1633. 2 => 91,
  1634. ),
  1635. 'Georgian Supplement' => array(
  1636. 0 => 0x2D00,
  1637. 1 => 0x2D2F,
  1638. 2 => 92,
  1639. ),
  1640. 'Tifinagh' => array(
  1641. 0 => 0x2D30,
  1642. 1 => 0x2D7F,
  1643. 2 => 93,
  1644. ),
  1645. 'Ethiopic Extended' => array(
  1646. 0 => 0x2D80,
  1647. 1 => 0x2DDF,
  1648. 2 => 94,
  1649. ),
  1650. 'Cyrillic Extended-A' => array(
  1651. 0 => 0x2DE0,
  1652. 1 => 0x2DFF,
  1653. 2 => 95,
  1654. ),
  1655. 'Supplemental Punctuation' => array(
  1656. 0 => 0x2E00,
  1657. 1 => 0x2E7F,
  1658. 2 => 96,
  1659. ),
  1660. 'CJK Radicals Supplement' => array(
  1661. 0 => 0x2E80,
  1662. 1 => 0x2EFF,
  1663. 2 => 97,
  1664. ),
  1665. 'Kangxi Radicals' => array(
  1666. 0 => 0x2F00,
  1667. 1 => 0x2FDF,
  1668. 2 => 98,
  1669. ),
  1670. 'Ideographic Description Characters' => array(
  1671. 0 => 0x2FF0,
  1672. 1 => 0x2FFF,
  1673. 2 => 99,
  1674. ),
  1675. 'CJK Symbols and Punctuation' => array(
  1676. 0 => 0x3000,
  1677. 1 => 0x303F,
  1678. 2 => 100,
  1679. ),
  1680. 'Hiragana' => array(
  1681. 0 => 0x3040,
  1682. 1 => 0x309F,
  1683. 2 => 101,
  1684. ),
  1685. 'Katakana' => array(
  1686. 0 => 0x30A0,
  1687. 1 => 0x30FF,
  1688. 2 => 102,
  1689. ),
  1690. 'Bopomofo' => array(
  1691. 0 => 0x3100,
  1692. 1 => 0x312F,
  1693. 2 => 103,
  1694. ),
  1695. 'Hangul Compatibility Jamo' => array(
  1696. 0 => 0x3130,
  1697. 1 => 0x318F,
  1698. 2 => 104,
  1699. ),
  1700. 'Kanbun' => array(
  1701. 0 => 0x3190,
  1702. 1 => 0x319F,
  1703. 2 => 105,
  1704. ),
  1705. 'Bopomofo Extended' => array(
  1706. 0 => 0x31A0,
  1707. 1 => 0x31BF,
  1708. 2 => 106,
  1709. ),
  1710. 'CJK Strokes' => array(
  1711. 0 => 0x31C0,
  1712. 1 => 0x31EF,
  1713. 2 => 107,
  1714. ),
  1715. 'Katakana Phonetic Extensions' => array(
  1716. 0 => 0x31F0,
  1717. 1 => 0x31FF,
  1718. 2 => 108,
  1719. ),
  1720. 'Enclosed CJK Letters and Months' => array(
  1721. 0 => 0x3200,
  1722. 1 => 0x32FF,
  1723. 2 => 109,
  1724. ),
  1725. 'CJK Compatibility' => array(
  1726. 0 => 0x3300,
  1727. 1 => 0x33FF,
  1728. 2 => 110,
  1729. ),
  1730. 'CJK Unified Ideographs Extension A' => array(
  1731. 0 => 0x3400,
  1732. 1 => 0x4DBF,
  1733. 2 => 111,
  1734. ),
  1735. 'Yijing Hexagram Symbols' => array(
  1736. 0 => 0x4DC0,
  1737. 1 => 0x4DFF,
  1738. 2 => 112,
  1739. ),
  1740. 'CJK Unified Ideographs' => array(
  1741. 0 => 0x4E00,
  1742. 1 => 0x9FFF,
  1743. 2 => 113,
  1744. ),
  1745. 'Yi Syllables' => array(
  1746. 0 => 0xA000,
  1747. 1 => 0xA48F,
  1748. 2 => 114,
  1749. ),
  1750. 'Yi Radicals' => array(
  1751. 0 => 0xA490,
  1752. 1 => 0xA4CF,
  1753. 2 => 115,
  1754. ),
  1755. 'Lisu' => array(
  1756. 0 => 0xA4D0,
  1757. 1 => 0xA4FF,
  1758. 2 => 116,
  1759. ),
  1760. 'Vai' => array(
  1761. 0 => 0xA500,
  1762. 1 => 0xA63F,
  1763. 2 => 117,
  1764. ),
  1765. 'Cyrillic Extended-B' => array(
  1766. 0 => 0xA640,
  1767. 1 => 0xA69F,
  1768. 2 => 118,
  1769. ),
  1770. 'Bamum' => array(
  1771. 0 => 0xA6A0,
  1772. 1 => 0xA6FF,
  1773. 2 => 119,
  1774. ),
  1775. 'Modifier Tone Letters' => array(
  1776. 0 => 0xA700,
  1777. 1 => 0xA71F,
  1778. 2 => 120,
  1779. ),
  1780. 'Latin Extended-D' => array(
  1781. 0 => 0xA720,
  1782. 1 => 0xA7FF,
  1783. 2 => 121,
  1784. ),
  1785. 'Syloti Nagri' => array(
  1786. 0 => 0xA800,
  1787. 1 => 0xA82F,
  1788. 2 => 122,
  1789. ),
  1790. 'Common Indic Number Forms' => array(
  1791. 0 => 0xA830,
  1792. 1 => 0xA83F,
  1793. 2 => 123,
  1794. ),
  1795. 'Phags-pa' => array(
  1796. 0 => 0xA840,
  1797. 1 => 0xA87F,
  1798. 2 => 124,
  1799. ),
  1800. 'Saurashtra' => array(
  1801. 0 => 0xA880,
  1802. 1 => 0xA8DF,
  1803. 2 => 125,
  1804. ),
  1805. 'Devanagari Extended' => array(
  1806. 0 => 0xA8E0,
  1807. 1 => 0xA8FF,
  1808. 2 => 126,
  1809. ),
  1810. 'Kayah Li' => array(
  1811. 0 => 0xA900,
  1812. 1 => 0xA92F,
  1813. 2 => 127,
  1814. ),
  1815. 'Rejang' => array(
  1816. 0 => 0xA930,
  1817. 1 => 0xA95F,
  1818. 2 => 128,
  1819. ),
  1820. 'Hangul Jamo Extended-A' => array(
  1821. 0 => 0xA960,
  1822. 1 => 0xA97F,
  1823. 2 => 129,
  1824. ),
  1825. 'Javanese' => array(
  1826. 0 => 0xA980,
  1827. 1 => 0xA9DF,
  1828. 2 => 130,
  1829. ),
  1830. 'Cham' => array(
  1831. 0 => 0xAA00,
  1832. 1 => 0xAA5F,
  1833. 2 => 131,
  1834. ),
  1835. 'Myanmar Extended-A' => array(
  1836. 0 => 0xAA60,
  1837. 1 => 0xAA7F,
  1838. 2 => 132,
  1839. ),
  1840. 'Tai Viet' => array(
  1841. 0 => 0xAA80,
  1842. 1 => 0xAADF,
  1843. 2 => 133,
  1844. ),
  1845. 'Ethiopic Extended-A' => array(
  1846. 0 => 0xAB00,
  1847. 1 => 0xAB2F,
  1848. 2 => 134,
  1849. ),
  1850. 'Meetei Mayek' => array(
  1851. 0 => 0xABC0,
  1852. 1 => 0xABFF,
  1853. 2 => 135,
  1854. ),
  1855. 'Hangul Syllables' => array(
  1856. 0 => 0xAC00,
  1857. 1 => 0xD7AF,
  1858. 2 => 136,
  1859. ),
  1860. 'Hangul Jamo Extended-B' => array(
  1861. 0 => 0xD7B0,
  1862. 1 => 0xD7FF,
  1863. 2 => 137,
  1864. ),
  1865. 'High Surrogates' => array(
  1866. 0 => 0xD800,
  1867. 1 => 0xDB7F,
  1868. 2 => 138,
  1869. ),
  1870. 'High Private Use Surrogates' => array(
  1871. 0 => 0xDB80,
  1872. 1 => 0xDBFF,
  1873. 2 => 139,
  1874. ),
  1875. 'Low Surrogates' => array(
  1876. 0 => 0xDC00,
  1877. 1 => 0xDFFF,
  1878. 2 => 140,
  1879. ),
  1880. 'Private Use Area' => array(
  1881. 0 => 0xE000,
  1882. 1 => 0xF8FF,
  1883. 2 => 141,
  1884. ),
  1885. 'CJK Compatibility Ideographs' => array(
  1886. 0 => 0xF900,
  1887. 1 => 0xFAFF,
  1888. 2 => 142,
  1889. ),
  1890. 'Alphabetic Presentation Forms' => array(
  1891. 0 => 0xFB00,
  1892. 1 => 0xFB4F,
  1893. 2 => 143,
  1894. ),
  1895. 'Arabic Presentation Forms-A' => array(
  1896. 0 => 0xFB50,
  1897. 1 => 0xFDFF,
  1898. 2 => 144,
  1899. ),
  1900. 'Variation Selectors' => array(
  1901. 0 => 0xFE00,
  1902. 1 => 0xFE0F,
  1903. 2 => 145,
  1904. ),
  1905. 'Vertical Forms' => array(
  1906. 0 => 0xFE10,
  1907. 1 => 0xFE1F,
  1908. 2 => 146,
  1909. ),
  1910. 'Combining Half Marks' => array(
  1911. 0 => 0xFE20,
  1912. 1 => 0xFE2F,
  1913. 2 => 147,
  1914. ),
  1915. 'CJK Compatibility Forms' => array(
  1916. 0 => 0xFE30,
  1917. 1 => 0xFE4F,
  1918. 2 => 148,
  1919. ),
  1920. 'Small Form Variants' => array(
  1921. 0 => 0xFE50,
  1922. 1 => 0xFE6F,
  1923. 2 => 149,
  1924. ),
  1925. 'Arabic Presentation Forms-B' => array(
  1926. 0 => 0xFE70,
  1927. 1 => 0xFEFF,
  1928. 2 => 150,
  1929. ),
  1930. 'Halfwidth and Fullwidth Forms' => array(
  1931. 0 => 0xFF00,
  1932. 1 => 0xFFEF,
  1933. 2 => 151,
  1934. ),
  1935. 'Specials' => array(
  1936. 0 => 0xFFF0,
  1937. 1 => 0xFFFF,
  1938. 2 => 152,
  1939. ),
  1940. 'Linear B Syllabary' => array(
  1941. 0 => 0x10000,
  1942. 1 => 0x1007F,
  1943. 2 => 153,
  1944. ),
  1945. 'Linear B Ideograms' => array(
  1946. 0 => 0x10080,
  1947. 1 => 0x100FF,
  1948. 2 => 154,
  1949. ),
  1950. 'Aegean Numbers' => array(
  1951. 0 => 0x10100,
  1952. 1 => 0x1013F,
  1953. 2 => 155,
  1954. ),
  1955. 'Ancient Greek Numbers' => array(
  1956. 0 => 0x10140,
  1957. 1 => 0x1018F,
  1958. 2 => 156,
  1959. ),
  1960. 'Ancient Symbols' => array(
  1961. 0 => 0x10190,
  1962. 1 => 0x101CF,
  1963. 2 => 157,
  1964. ),
  1965. 'Phaistos Disc' => array(
  1966. 0 => 0x101D0,
  1967. 1 => 0x101FF,
  1968. 2 => 158,
  1969. ),
  1970. 'Lycian' => array(
  1971. 0 => 0x10280,
  1972. 1 => 0x1029F,
  1973. 2 => 159,
  1974. ),
  1975. 'Carian' => array(
  1976. 0 => 0x102A0,
  1977. 1 => 0x102DF,
  1978. 2 => 160,
  1979. ),
  1980. 'Old Italic' => array(
  1981. 0 => 0x10300,
  1982. 1 => 0x1032F,
  1983. 2 => 161,
  1984. ),
  1985. 'Gothic' => array(
  1986. 0 => 0x10330,
  1987. 1 => 0x1034F,
  1988. 2 => 162,
  1989. ),
  1990. 'Ugaritic' => array(
  1991. 0 => 0x10380,
  1992. 1 => 0x1039F,
  1993. 2 => 163,
  1994. ),
  1995. 'Old Persian' => array(
  1996. 0 => 0x103A0,
  1997. 1 => 0x103DF,
  1998. 2 => 164,
  1999. ),
  2000. 'Deseret' => array(
  2001. 0 => 0x10400,
  2002. 1 => 0x1044F,
  2003. 2 => 165,
  2004. ),
  2005. 'Shavian' => array(
  2006. 0 => 0x10450,
  2007. 1 => 0x1047F,
  2008. 2 => 166,
  2009. ),
  2010. 'Osmanya' => array(
  2011. 0 => 0x10480,
  2012. 1 => 0x104AF,
  2013. 2 => 167,
  2014. ),
  2015. 'Cypriot Syllabary' => array(
  2016. 0 => 0x10800,
  2017. 1 => 0x1083F,
  2018. 2 => 168,
  2019. ),
  2020. 'Imperial Aramaic' => array(
  2021. 0 => 0x10840,
  2022. 1 => 0x1085F,
  2023. 2 => 169,
  2024. ),
  2025. 'Phoenician' => array(
  2026. 0 => 0x10900,
  2027. 1 => 0x1091F,
  2028. 2 => 170,
  2029. ),
  2030. 'Lydian' => array(
  2031. 0 => 0x10920,
  2032. 1 => 0x1093F,
  2033. 2 => 171,
  2034. ),
  2035. 'Kharoshthi' => array(
  2036. 0 => 0x10A00,
  2037. 1 => 0x10A5F,
  2038. 2 => 172,
  2039. ),
  2040. 'Old South Arabian' => array(
  2041. 0 => 0x10A60,
  2042. 1 => 0x10A7F,
  2043. 2 => 173,
  2044. ),
  2045. 'Avestan' => array(
  2046. 0 => 0x10B00,
  2047. 1 => 0x10B3F,
  2048. 2 => 174,
  2049. ),
  2050. 'Inscriptional Parthian' => array(
  2051. 0 => 0x10B40,
  2052. 1 => 0x10B5F,
  2053. 2 => 175,
  2054. ),
  2055. 'Inscriptional Pahlavi' => array(
  2056. 0 => 0x10B60,
  2057. 1 => 0x10B7F,
  2058. 2 => 176,
  2059. ),
  2060. 'Old Turkic' => array(
  2061. 0 => 0x10C00,
  2062. 1 => 0x10C4F,
  2063. 2 => 177,
  2064. ),
  2065. 'Rumi Numeral Symbols' => array(
  2066. 0 => 0x10E60,
  2067. 1 => 0x10E7F,
  2068. 2 => 178,
  2069. ),
  2070. 'Brahmi' => array(
  2071. 0 => 0x11000,
  2072. 1 => 0x1107F,
  2073. 2 => 179,
  2074. ),
  2075. 'Kaithi' => array(
  2076. 0 => 0x11080,
  2077. 1 => 0x110CF,
  2078. 2 => 180,
  2079. ),
  2080. 'Cuneiform' => array(
  2081. 0 => 0x12000,
  2082. 1 => 0x123FF,
  2083. 2 => 181,
  2084. ),
  2085. 'Cuneiform Numbers and Punctuation' => array(
  2086. 0 => 0x12400,
  2087. 1 => 0x1247F,
  2088. 2 => 182,
  2089. ),
  2090. 'Egyptian Hieroglyphs' => array(
  2091. 0 => 0x13000,
  2092. 1 => 0x1342F,
  2093. 2 => 183,
  2094. ),
  2095. 'Bamum Supplement' => array(
  2096. 0 => 0x16800,
  2097. 1 => 0x16A3F,
  2098. 2 => 184,
  2099. ),
  2100. 'Kana Supplement' => array(
  2101. 0 => 0x1B000,
  2102. 1 => 0x1B0FF,
  2103. 2 => 185,
  2104. ),
  2105. 'Byzantine Musical Symbols' => array(
  2106. 0 => 0x1D000,
  2107. 1 => 0x1D0FF,
  2108. 2 => 186,
  2109. ),
  2110. 'Musical Symbols' => array(
  2111. 0 => 0x1D100,
  2112. 1 => 0x1D1FF,
  2113. 2 => 187,
  2114. ),
  2115. 'Ancient Greek Musical Notation' => array(
  2116. 0 => 0x1D200,
  2117. 1 => 0x1D24F,
  2118. 2 => 188,
  2119. ),
  2120. 'Tai Xuan Jing Symbols' => array(
  2121. 0 => 0x1D300,
  2122. 1 => 0x1D35F,
  2123. 2 => 189,
  2124. ),
  2125. 'Counting Rod Numerals' => array(
  2126. 0 => 0x1D360,
  2127. 1 => 0x1D37F,
  2128. 2 => 190,
  2129. ),
  2130. 'Mathematical Alphanumeric Symbols' => array(
  2131. 0 => 0x1D400,
  2132. 1 => 0x1D7FF,
  2133. 2 => 191,
  2134. ),
  2135. 'Mahjong Tiles' => array(
  2136. 0 => 0x1F000,
  2137. 1 => 0x1F02F,
  2138. 2 => 192,
  2139. ),
  2140. 'Domino Tiles' => array(
  2141. 0 => 0x1F030,
  2142. 1 => 0x1F09F,
  2143. 2 => 193,
  2144. ),
  2145. 'Playing Cards' => array(
  2146. 0 => 0x1F0A0,
  2147. 1 => 0x1F0FF,
  2148. 2 => 194,
  2149. ),
  2150. 'Enclosed Alphanumeric Supplement' => array(
  2151. 0 => 0x1F100,
  2152. 1 => 0x1F1FF,
  2153. 2 => 195,
  2154. ),
  2155. 'Enclosed Ideographic Supplement' => array(
  2156. 0 => 0x1F200,
  2157. 1 => 0x1F2FF,
  2158. 2 => 196,
  2159. ),
  2160. 'Miscellaneous Symbols And Pictographs' => array(
  2161. 0 => 0x1F300,
  2162. 1 => 0x1F5FF,
  2163. 2 => 197,
  2164. ),
  2165. 'Emoticons' => array(
  2166. 0 => 0x1F600,
  2167. 1 => 0x1F64F,
  2168. 2 => 198,
  2169. ),
  2170. 'Transport And Map Symbols' => array(
  2171. 0 => 0x1F680,
  2172. 1 => 0x1F6FF,
  2173. 2 => 199,
  2174. ),
  2175. 'Alchemical Symbols' => array(
  2176. 0 => 0x1F700,
  2177. 1 => 0x1F77F,
  2178. 2 => 200,
  2179. ),
  2180. 'CJK Unified Ideographs Extension B' => array(
  2181. 0 => 0x20000,
  2182. 1 => 0x2A6DF,
  2183. 2 => 201,
  2184. ),
  2185. 'CJK Unified Ideographs Extension C' => array(
  2186. 0 => 0x2A700,
  2187. 1 => 0x2B73F,
  2188. 2 => 202,
  2189. ),
  2190. 'CJK Unified Ideographs Extension D' => array(
  2191. 0 => 0x2B740,
  2192. 1 => 0x2B81F,
  2193. 2 => 203,
  2194. ),
  2195. 'CJK Compatibility Ideographs Supplement' => array(
  2196. 0 => 0x2F800,
  2197. 1 => 0x2FA1F,
  2198. 2 => 204,
  2199. ),
  2200. 'Tags' => array(
  2201. 0 => 0xE0000,
  2202. 1 => 0xE007F,
  2203. 2 => 205,
  2204. ),
  2205. 'Variation Selectors Supplement' => array(
  2206. 0 => 0xE0100,
  2207. 1 => 0xE01EF,
  2208. 2 => 206,
  2209. ),
  2210. 'Supplementary Private Use Area-A' => array(
  2211. 0 => 0xF0000,
  2212. 1 => 0xFFFFF,
  2213. 2 => 207,
  2214. ),
  2215. 'Supplementary Private Use Area-B' => array(
  2216. 0 => 0x100000,
  2217. 1 => 0x10FFFF,
  2218. 2 => 208,
  2219. ),
  2220. );
  /**
   * Private constructor: the methods of this class are meant to be called statically only.
   */
  private function __construct()
  {
  }
  /**
   * Removes combining diacritical marks from a text, optionally recording
   * what is needed to put them back with self::diactrical_restore().
   *
   * Runs of marks (matched by self::$diactrical_re plus any extra characters given in
   * $additional_chars) are cut out of the text. When $is_can_restored is true,
   * $restore_table is filled with:
   *   'offsets' -- removed run of marks, keyed by its character (not byte!) position in the cleaned text
   *   'length'  -- length of the cleaned text in characters
   * If nothing was removed, $restore_table stays an empty array.
   *
   * @see self::diactrical_restore()
   * @param string|null $s
   * @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen
   * @param bool $is_can_restored
   * @param array|null &$restore_table
   * @return string|bool|null Returns FALSE if error occurred
   */
  public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false;
      }
      if (is_null($s)) {
          return $s;
      }

      #alternatives of the pattern: the built-in marks first, then the quoted extra characters
      $alternatives = self::$diactrical_re;
      if ($additional_chars) {
          $quoted = array();
          foreach ($additional_chars as $char) {
              $quoted[] = preg_quote($char, '/');
          }
          $alternatives .= '|' . implode('|', $quoted);
      }
      $pattern = '/((?>' . $alternatives . ')+)/sxSX';

      if (! $is_can_restored) {
          return preg_replace($pattern, '', $s);
      }

      $restore_table = array();
      $pieces = preg_split($pattern, $s, -1, PREG_SPLIT_DELIM_CAPTURE);
      $total = count($pieces);
      if ($total === 1) {
          return $s; #no marks found, nothing to remember
      }

      $char_pos = 0;
      $clean = '';
      $last = $total - 1;
      #even indexes hold plain text, odd indexes hold the removed runs of marks
      for ($i = 0; $i < $last; $i += 2) {
          $clean .= $pieces[$i];
          #remember character (not byte!) positions
          $char_pos += self::strlen($pieces[$i]);
          $restore_table['offsets'][$char_pos] = $pieces[$i + 1];
      }
      $tail = end($pieces);
      $restore_table['length'] = $char_pos + self::strlen($tail);
      return $clean . $tail;
  }
  /**
   * Restores the combining diacritical marks removed by self::diactrical_remove().
   *
   * Works only if the character positions and the number of characters of the text
   * have not changed since the marks were removed: the character length of $s must be
   * equal to $restore_table['length'].
   *
   * An empty $restore_table means that nothing was removed, $s is returned as is.
   *
   * @see self::diactrical_remove()
   * @param string|null $s
   * @param array $restore_table
   * @return string|bool|null Returns FALSE if error occurred (broken $restore_table)
   */
  public static function diactrical_restore($s, array $restore_table)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false;
      }
      if (is_null($s)) {
          return $s;
      }
      if (! $restore_table) {
          return $s;
      }

      #explicit existence checks instead of silencing "undefined index" with @
      if (! isset($restore_table['length'], $restore_table['offsets'])
          || ! is_int($restore_table['length'])
          || ! is_array($restore_table['offsets'])
          || $restore_table['length'] !== self::strlen($s)
      ) {
          return false;
      }

      $offset = 0;
      $restored = '';
      foreach ($restore_table['offsets'] as $pos => $diactricals) {
          #copy the plain text up to the recorded position, then re-insert the marks
          $restored .= self::substr($s, $offset, $pos - $offset) . $diactricals;
          $offset = $pos;
      }
      #the byte length is never less than the number of remaining characters, so this takes the whole tail
      return $restored . self::substr($s, $offset, strlen($s));
  }
  /**
   * Converts data from the given character encoding to UTF-8.
   * Thin wrapper around self::_convert() with 'UTF-8' as the target charset.
   *
   * @param array|scalar|null $data
   * @param string $charset
   * @return array|scalar|null Returns FALSE if error occurred
   */
  public static function convert_from($data, $charset = 'cp1251')
  {
      if (ReflectionTypeHint::isValid()) {
          return self::_convert($data, $charset, 'UTF-8');
      }
      return false;
  }
  /**
   * Converts data from UTF-8 to the given character encoding.
   * Thin wrapper around self::_convert() with 'UTF-8' as the source charset.
   *
   * @param array|scalar|null $data
   * @param string $charset
   * @return array|scalar|null Returns FALSE if error occurred
   */
  public static function convert_to($data, $charset = 'cp1251')
  {
      if (ReflectionTypeHint::isValid()) {
          return self::_convert($data, 'UTF-8', $charset);
      }
      return false;
  }
  /**
   * Recodes data of any structure from one charset to another (to/from UTF-8).
   * Arrays are traversed recursively, both keys and values are recoded.
   * Non-string scalars and null are returned unchanged.
   *
   * For strings the conversion backends are tried in this order:
   * iconv, mbstring, built-in fallbacks (UTF-16/UCS-2, cp1251/cp1259 and, where
   * convert_cyr_string() is available, koi8-r, iso8859-5, cp866, mac-cyrillic),
   * then recode_string(). If nothing fits, E_USER_WARNING is raised.
   *
   * @see mb_encoding_aliases()
   * @param array|scalar|null $data
   * @param string $charset_from
   * @param string $charset_to
   * @return array|scalar|null Returns FALSE if error occurred
   */
  private static function _convert($data, $charset_from, $charset_to)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false; #for recursive calls
      }
      if ($charset_from === $charset_to) {
          return $data;
      }

      if (is_array($data)) {
          $d = array();
          foreach ($data as $k => $v) {
              $k = self::_convert($k, $charset_from, $charset_to);
              if ($k === false) {
                  return false;
              }
              $d[$k] = self::_convert($v, $charset_from, $charset_to);
              #FALSE is an error only if the source value was not a boolean itself
              if ($d[$k] === false && ! is_bool($v)) {
                  return false;
              }
          }
          return $d;
      }

      if (! is_string($data)) {
          return $data;
      }

      #smart behaviour for errors protected + speed improve
      if ($charset_from === 'UTF-8' && ! self::is_utf8($data)) {
          return $data;
      }
      if ($charset_to === 'UTF-8' && self::is_utf8($data)) {
          return $data;
      }

      #since PHP-5.3.x iconv() faster then mb_convert_encoding()
      if (function_exists('iconv')) {
          return iconv($charset_from, $charset_to . '//IGNORE//TRANSLIT', $data);
      }
      if (function_exists('mb_convert_encoding')) {
          return mb_convert_encoding($data, $charset_to, $charset_from);
      }

      #charset_from
      if ($charset_from === 'UTF-16' || $charset_from === 'UCS-2') {
          return self::_convert_from_utf16($data);
      }
      if ($charset_from === 'cp1251' || $charset_from === 'cp1259') {
          return strtr($data, self::$cp1259_table);
      }
      #convert_cyr_string() was deprecated in PHP 7.4 and removed in PHP 8.0:
      #calling it unguarded is a fatal error there, so use it only where it still exists
      if (function_exists('convert_cyr_string')) {
          $cyr_codes = array(
              'koi8-r'       => 'k',
              'KOI8-R'       => 'k',
              'iso8859-5'    => 'i',
              'cp866'        => 'a',
              'mac-cyrillic' => 'm',
          );
          if (isset($cyr_codes[$charset_from])) {
              #first to windows-1251 ('w'), then through the same table as cp1251 above
              return strtr(convert_cyr_string($data, $cyr_codes[$charset_from], 'w'), self::$cp1259_table);
          }
      }

      #charset_to
      if ($charset_to === 'cp1251' || $charset_to === 'cp1259') {
          return strtr($data, array_flip(self::$cp1259_table));
      }

      #last trying
      if (function_exists('recode_string')) {
          $s = @recode_string($charset_from . '..' . $charset_to, $data);
          if (is_string($s)) {
              return $s;
          }
      }

      trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING);
      return false;
  }
  /**
   * Converts a UTF-16 / UCS-2 encoded string to UTF-8.
   * UTF-16 surrogate pairs are supported.
   *
   * iconv or mbstring is used when available. Otherwise the string is decoded manually;
   * in that mode every ill-formed surrogate (a high surrogate without a following low one,
   * a low surrogate without a preceding high one, a high surrogate at the very end)
   * is replaced by U+FFFD REPLACEMENT CHARACTER and no other code unit is lost.
   *
   * @param string $s
   * @param string $type 'BE' -- big endian byte order
   *                     'LE' -- little endian byte order
   * @param bool $to_array returns array chars instead whole string?
   * @return string|array|bool UTF-8 string, array chars or FALSE if error occurred
   */
  private static function _convert_from_utf16($s, $type = 'BE', $to_array = false)
  {
      static $types = array(
          'BE' => 'n', #unsigned short (always 16 bit, big endian byte order)
          'LE' => 'v', #unsigned short (always 16 bit, little endian byte order)
      );
      if (! array_key_exists($type, $types)) {
          trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING);
          return false;
      }

      #the fastest way:
      if (function_exists('iconv') || function_exists('mb_convert_encoding')) {
          if (function_exists('iconv')) {
              $s = iconv('UTF-16' . $type, 'UTF-8', $s);
          } else {
              $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type);
          }
          if (! $to_array) {
              return $s;
          }
          return self::str_split($s);
      }

      /*
      http://en.wikipedia.org/wiki/UTF-16
      UTF-16 represents non-BMP characters (U+10000 through U+10FFFF) using a pair
      of 16-bit words, known as a surrogate pair. First 0x10000 is subtracted from
      the code point to give a 20-bit value. This is then split into two 10-bit values,
      the most significant half placed in the first surrogate:
        0xD800-0xDBFF -- first (high) surrogate
        0xDC00-0xDFFF -- second (low) surrogate
      For example, U+10000 becomes 0xD800 0xDC00, and U+10FFFD becomes 0xDBFF 0xDFFD.
      An individual code value from a surrogate pair never represents a character.

      http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
      http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm
      Conversion of a Unicode scalar value S to a surrogate pair <H, L>:
        H = Math.floor((S - 0x10000) / 0x400) + 0xD800;
        L = ((S - 0x10000) % 0x400) + 0xDC00;
      The conversion of a surrogate pair <H, L> to a scalar value:
        N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000;
      */
      $replacement = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER (for broken char)
      $a = array();
      $hi = false; #pending high surrogate or FALSE
      foreach (unpack($types[$type] . '*', $s) as $unit) {
          if ($hi !== false) {
              $pending = $hi;
              $hi = false;
              if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                  #well-formed pair
                  $a[] = self::chr((($pending - 0xD800) * 0x400) + ($unit - 0xDC00) + 0x10000);
                  continue;
              }
              #unpaired high surrogate; the current unit is not consumed and is handled below
              $a[] = $replacement;
          }
          if ($unit >= 0xD800 && $unit <= 0xDBFF) {
              $hi = $unit; #high surrogate was found, wait for its pair
          } elseif ($unit >= 0xDC00 && $unit <= 0xDFFF) {
              $a[] = $replacement; #low surrogate without a preceding high one
          } else {
              $a[] = self::chr($unit); #not surrogate
          }
      }
      if ($hi !== false) {
          $a[] = $replacement; #input ended in the middle of a surrogate pair
      }
      return $to_array ? $a : implode('', $a);
  }
  /**
   * Strips out device control codes in the ASCII range:
   * bytes 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F are removed,
   * while tab (0x09), line feed (0x0A) and carriage return (0x0D) are kept.
   * NULL is passed through unchanged.
   *
   * @param string|null String to clean
   * @return string|bool|null Returns FALSE if error occurred
   */
  public static function strict($s)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false;
      }
      return is_null($s)
          ? $s
          : preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $s);
  }
  /**
   * Checks that the data consists of ASCII characters (bytes 0x00-0x7F) only.
   * For null, integer, float, boolean returns TRUE.
   *
   * Arrays are traversed recursively (keys and values): if at least one
   * key or value is not ASCII, FALSE is returned.
   * Objects and resources give FALSE.
   *
   * @param array|scalar|null $data
   * @return bool
   */
  public static function is_ascii($data)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false;
      }
      if (is_string($data)) {
          #ltrim() is a little faster than preg_match(): nothing must be left after trimming all ASCII bytes
          return ltrim($data, "\x00..\x7f") === '';
      }
      if (is_array($data)) {
          foreach ($data as $key => $value) {
              if (! (self::is_ascii($key) && self::is_ascii($value))) {
                  return false;
              }
          }
          return true;
      }
      #TRUE for null, integer, float, boolean; FALSE for object or resource
      return is_scalar($data) || is_null($data);
  }
  /**
   * Returns TRUE if the data is valid UTF-8 and FALSE otherwise.
   * For null, integer, float, boolean returns TRUE.
   *
   * Arrays are traversed recursively (keys and values): if at least one
   * key or value is not valid UTF-8, FALSE is returned.
   * Objects and resources give FALSE.
   *
   * In strict mode a string is additionally rejected if it contains a byte outside of
   * 0x09, 0x0A, 0x0D, 0x20-0xBF, 0xC2-0xF7 (for example, ASCII control codes).
   *
   * @link http://www.w3.org/International/questions/qa-forms-utf-8.html
   * @link http://ru3.php.net/mb_detect_encoding
   * @link http://webtest.philigon.ru/articles/utf8/
   * @link http://unicode.coeurlumiere.com/
   * @param array|scalar|null $data
   * @param bool $is_strict strict the range of ASCII?
   * @return bool
   */
  public static function is_utf8($data, $is_strict = true)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false;
      }
      if (is_array($data)) {
          foreach ($data as $key => $value) {
              if (! (self::is_utf8($key, $is_strict) && self::is_utf8($value, $is_strict))) {
                  return false;
              }
          }
          return true;
      }
      if (is_string($data)) {
          #an empty pattern with the "u" modifier matches only if the subject is valid UTF-8;
          #the original author measured it up to 4 times faster than mb_check_encoding($data, 'UTF-8')
          $is_valid = preg_match('~~suSX', $data)
              && ! (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR);
          if (! $is_valid) {
              return false;
          }
          return ! ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xBF\xC2-\xF7]/sSX', $data));
      }
      #TRUE for null, integer, float, boolean; FALSE for object or resource
      return is_scalar($data) || is_null($data);
  }
  /**
   * Tries to detect if a string is in Unicode (UTF-8) encoding
   * by walking over its bytes: every lead byte must be followed
   * by the proper number of continuation bytes (10bbbbbb).
   *
   * In strict mode only the ASCII bytes 0x20-0x7E, 0x09, 0x0A and 0x0D are accepted
   * among the single-byte characters.
   *
   * @deprecated Slowly, use self::is_utf8() instead
   * @see self::is_utf8()
   * @param string $s text
   * @param bool $is_strict strict check of the ASCII range?
   * @return bool
   */
  public static function check($s, $is_strict = true)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false;
      }
      $len = strlen($s);
      $i = 0;
      while ($i < $len) {
          $byte = ord($s[$i]);

          #1 byte 0bbbbbbb
          $is_accepted_ascii = $byte < 0x80
              && ($is_strict === false || ($byte > 0x1F && $byte < 0x7F) || $byte == 0x09 || $byte == 0x0A || $byte == 0x0D);
          if ($is_accepted_ascii) {
              $i++;
              continue;
          }

          #number of continuation bytes expected after the lead byte;
          #a rejected ASCII byte matches no model below and gives FALSE
          if (($byte & 0xE0) == 0xC0) {
              $tail = 1; #2 bytes 110bbbbb 10bbbbbb
          } elseif (($byte & 0xF0) == 0xE0) {
              $tail = 2; #3 bytes 1110bbbb 10bbbbbb 10bbbbbb
          } elseif (($byte & 0xF8) == 0xF0) {
              $tail = 3; #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
          } elseif (($byte & 0xFC) == 0xF8) {
              $tail = 4; #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
          } elseif (($byte & 0xFE) == 0xFC) {
              $tail = 5; #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
          } else {
              return false; #does not match any model
          }

          #do $tail bytes matching 10bbbbbb follow?
          for ($k = 0; $k < $tail; $k++) {
              $i++;
              if ($i == $len || (ord($s[$i]) & 0xC0) != 0x80) {
                  return false;
              }
          }
          $i++;
      }
      return true;
  }
  /**
   * Check the data in UTF-8 charset on given ranges of the standard UNICODE.
   * The suitable alternative to regular expressions.
   *
   * For null, integer, float, boolean returns TRUE.
   *
   * Arrays traversed recursively (keys and values).
   * At least if one array element value is not passed checking, it returns FALSE.
   *
   * Each element of $blocks is one of:
   *   string -- name of a block from self::$unicode_blocks (an unknown name gives E_USER_WARNING and FALSE)
   *   array  -- pair (min codepoint, max codepoint), both inclusive
   *   int    -- single codepoint
   * Any other type raises E_USER_ERROR; if an error handler resumes execution, FALSE is returned.
   *
   * @example
   * #A simple check the standard named ranges:
   * UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic'));
   * #You can check the named, direct ranges or codepoints together:
   * UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E), #[\x20-\x7E]
   * array(0x0410, 0x044F), #[A-Яa-я]
   * 0x0401, #russian yo (Ё)
   * 0x0451, #russian ye (ё)
   * 'Arrows',
   * ));
   *
   * @link http://www.unicode.org/charts/
   * @param array|scalar|null $data
   * @param array|string $blocks
   * @return bool Returns TRUE if all characters of the text belong to the given ranges
   *              and FALSE otherwise or for broken UTF-8.
   */
  public static function blocks_check($data, $blocks)
  {
      if (! ReflectionTypeHint::isValid()) {
          return false;
      }
      if (is_array($data)) {
          foreach ($data as $k => $v) {
              if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) {
                  return false;
              }
          }
          return true;
      }
      if (is_string($data)) {
          $chars = self::str_split($data);
          if ($chars === false) {
              return false; #broken UTF-8
          }
          unset($data); #memory free
          $blocks = (array)$blocks; #normalized once, not on every character
          $skip = array(); #save to cache already checked symbols
          foreach ($chars as $char) {
              if (array_key_exists($char, $skip)) {
                  continue; #speed improve
              }
              $codepoint = self::ord($char);
              if ($codepoint === false) {
                  return false; #broken UTF-8
              }
              $is_valid = false;
              foreach ($blocks as $j => $block) {
                  if (is_string($block)) {
                      if (! array_key_exists($block, self::$unicode_blocks)) {
                          trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING);
                          return false;
                      }
                      list ($min, $max) = self::$unicode_blocks[$block];
                  } elseif (is_array($block)) {
                      list ($min, $max) = $block;
                  } elseif (is_int($block)) {
                      $min = $max = $block;
                  } else {
                      trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR);
                      #reached only if an error handler resumes execution:
                      #never compare against bounds left over from a previous block
                      return false;
                  }
                  if ($codepoint >= $min && $codepoint <= $max) {
                      $is_valid = true;
                      break;
                  }
              }
              if (! $is_valid) {
                  return false;
              }
              $skip[$char] = null;
          }
          return true;
      }
      if (is_scalar($data) || is_null($data)) {
          return true; #~ null, integer, float, boolean
      }
      return false; #object or resource
  }
  /**
   * Recode $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES from $charset encoding to UTF-8, if necessary.
   * A side effect is a positive protection against XSS attacks with non-printable characters on the vulnerable PHP function.
   * Thus web forms can be sent to the server in 2-encoding: $charset and UTF-8.
   * For example: ?тест[тест]=тест
   *
   * How it works:
   *   1) The keys and values of $_GET, $_POST, $_COOKIE and $_FILES are checked for being valid UTF-8.
   *   2) Anything that is not UTF-8 is assumed to be in $charset and is converted to UTF-8.
   *   3) The converted data is checked again. If it still is not UTF-8,
   *      it is considered broken and the method returns FALSE.
   *   4) If anything was converted, $_REQUEST is rebuilt from $_COOKIE, $_POST and $_GET.
   *
   * NOTICE
   *   This method should be called after self::unescape_request()!
   *
   * @see     self::unescape_request()
   * @param   bool    $is_hex2bin  Decode HEX data?
   *                               Example: 0xd09ec2a0d0bad0bed0bcd0bfd0b0d0bdd0b8d0b8 => О компании
   *                               URL parameters are sometimes more convenient to encode not with rawurlencode()
   *                               but with the following (more compact) scheme:
   *                               '0x' . bin2hex($string)
   * @param   string  $charset
   * @return  bool    TRUE if all keys/values of the arrays are in UTF-8,
   *                  FALSE + E_USER_WARNING otherwise.
   */
  public static function autoconvert_request($is_hex2bin = false, $charset = 'cp1251')
  {
      if (! ReflectionTypeHint::isValid()) return false;

      $is_converted = false;
      $is_broken    = false;
      foreach (array('_GET', '_POST', '_COOKIE', '_FILES') as $name)
      {
          if (array_key_exists($name, $GLOBALS))
          {
              # array_walk_recursive() cannot be used here,
              # because its callback does not receive the key by reference
              $GLOBALS[$name] = self::_autoconvert_request_recursive($GLOBALS[$name], $is_converted, $is_broken, $is_hex2bin, $charset);
              if ($is_broken)
              {
                  trigger_error('Array $' . $name . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING);
                  return false;
              }
          }
      }

      if (! $is_converted) return true;

      # With the array union operator the left operand wins: cookie, then post, then get.
      $cookie = isset($_COOKIE) ? $_COOKIE : array();
      $post   = isset($_POST)   ? $_POST   : array();
      $get    = isset($_GET)    ? $_GET    : array();
      $_REQUEST = $cookie + $post + $get;
      return true;
  }
  /**
   * Recursive worker for self::autoconvert_request():
   * returns a copy of $data in which every key and every scalar value went through self::_autoconvert_request().
   * As soon as $is_broken is raised, the walk stops and $data is returned as it is at that moment.
   */
  private static function _autoconvert_request_recursive(&$data, &$is_converted, &$is_broken, $is_hex2bin, $charset)
  {
      if ($is_broken) return $data; #speed improve

      if (! is_array($data))
      {
          return self::_autoconvert_request($data, $is_converted, $is_broken, $is_hex2bin, $charset);
      }

      $converted = array();
      # values are taken by reference because the callee receives (and may rewrite) them by reference
      foreach ($data as $key => &$value)
      {
          $new_key = self::_autoconvert_request($key, $is_converted, $is_broken, $is_hex2bin, $charset);
          if ($is_broken) return $data; #speed improve

          $converted[$new_key] = self::_autoconvert_request_recursive($value, $is_converted, $is_broken, $is_hex2bin, $charset);
          if ($is_broken) return $data; #speed improve
      }
      return $converted;
  }
  /**
   * Converts a single key or value for self::autoconvert_request().
   * Optionally decodes the '0x' . bin2hex() form, then makes sure the result is UTF-8,
   * recoding it from $charset when it is not. $s is rewritten in place and also returned.
   * $is_converted / $is_broken are raised (never lowered) to report what happened.
   */
  private static function _autoconvert_request(&$s, &$is_converted, &$is_broken, $is_hex2bin, $charset)
  {
      #regexp speed improve by using strpos()
      if ($is_hex2bin && strpos($s, '0x') === 0 && preg_match('/^0x((?:[\da-fA-F]{2})+)$/sSX', $s, $m))
      {
          $s = pack('H*', $m[1]); #hex2bin()
          $is_converted = true;
      }

      if (self::is_utf8($s)) return $s;

      $s = self::convert_from($s, $charset);
      if ($s === false)
      {
          $is_broken = true;
          return $s;
      }
      if (self::is_utf8($s))
      {
          $is_converted = true;
          return $s;
      }

      trigger_error('String 0x ' . substr(bin2hex($s), 0, 100) . '... is not UTF-8!', E_USER_WARNING);
      $is_broken = true;
      return $s;
  }
  /**
   * Compares two strings.
   * Uses the intl Collator for the given locale when the intl extension is available,
   * otherwise falls back to the byte-wise native strcmp().
   *
   * @param   string|null    $s1
   * @param   string|null    $s2
   * @param   string         $locale   For example, 'en_CA', 'ru_RU'
   * @return  int|bool|null  Returns FALSE if error occurred
   *                         Returns NULL if $s1 or $s2 is NULL
   *                         Returns < 0 if $s1 is less than $s2;
   *                                 > 0 if $s1 is greater than $s2;
   *                                 0 if they are equal.
   */
  public static function strcmp($s1, $s2, $locale = '')
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s1) || is_null($s2)) return null;
      if (! function_exists('collator_create')) return strcmp($s1, $s2);

      # PHP 5 >= 5.3.0, PECL intl >= 1.0.0
      # If empty string ("") or "root" are passed, UCA rules will be used.
      # collator_create() is used instead of "new Collator()": an object created with "new"
      # is never falsy, so the failure branch below could not be reached that way.
      $c = collator_create($locale);
      if (! $c)
      {
          # Use intl_get_error_code() and/or intl_get_error_message() to know what happened.
          trigger_error(intl_get_error_message(), E_USER_WARNING);
          return false;
      }
      return $c->compare($s1, $s2);
  }
  /**
   * Compares the first $length characters of two strings.
   *
   * @param   string|null    $s1
   * @param   string|null    $s2
   * @param   int            $length
   * @return  int|bool|null  Returns FALSE if error occurred
   *                         Returns NULL if $s1 or $s2 is NULL
   *                         Returns < 0 if $s1 is less than $s2;
   *                                 > 0 if $s1 is greater than $s2;
   *                                 0 if they are equal.
   */
  public static function strncmp($s1, $s2, $length)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if ($s1 === null || $s2 === null) return null;

      # cut both operands to $length characters, then delegate to self::strcmp()
      $head1 = self::substr($s1, 0, $length);
      $head2 = self::substr($s2, 0, $length);
      return self::strcmp($head1, $head2);
  }
  /**
   * Case-insensitive string comparison for UTF-8 strings (counterpart of the native strcasecmp()).
   *
   * @param   string|null    $s1
   * @param   string|null    $s2
   * @return  int|bool|null  Returns FALSE if error occurred
   *                         Returns NULL if $s1 or $s2 is NULL
   *                         Returns < 0 if $s1 is less than $s2;
   *                                 > 0 if $s1 is greater than $s2;
   *                                 0 if they are equal.
   */
  public static function strcasecmp($s1, $s2)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if ($s1 === null || $s2 === null) return null;

      # lower-case both operands, then delegate to self::strcmp()
      $lower1 = self::lowercase($s1);
      $lower2 = self::lowercase($s2);
      return self::strcmp($lower1, $lower2);
  }
  /**
   * Converts a UTF-8 string to a list of Unicode codepoints.
   *
   * @param   string|null      $s  UTF-8 string
   * @return  array|bool|null  Unicode codepoints
   *                           Returns FALSE if $s broken (not UTF-8)
   */
  public static function to_unicode($s)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # Preferred way: recode to big-endian UCS-4 and unpack 32-bit integers.
      # since PHP-5.3.x iconv() little faster then mb_convert_encoding()
      $ucs4 = null;
      if (function_exists('iconv'))
      {
          $ucs4 = @iconv('UTF-8', 'UCS-4BE', $s);
      }
      elseif (function_exists('mb_convert_encoding'))
      {
          $ucs4 = @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8');
      }

      if ($ucs4 === null)
      {
          # No converter produced a result: split into characters and decode each one.
          $chars = self::str_split($s);
          if ($chars === false) return false;
          return array_map(array(__CLASS__, 'ord'), $chars);
      }

      return is_string($ucs4) ? array_values(unpack('N*', $ucs4)) : false;
  }
  /**
   * Converts a list of Unicode codepoints to a UTF-8 string.
   *
   * @param   array|null        $a  Unicode codepoints
   * @return  string|bool|null  UTF-8 string
   *                            Returns FALSE if error occurred
   */
  public static function from_unicode($a)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($a)) return $a;

      $has_iconv = function_exists('iconv');
      if ($has_iconv || function_exists('mb_convert_encoding'))
      {
          # build a big-endian UCS-4 byte string, 4 bytes per codepoint
          $ucs4 = '';
          foreach ($a as $cp) $ucs4 .= pack('N', $cp);

          # since PHP-5.3.x iconv() little faster then mb_convert_encoding()
          $s = $has_iconv ? @iconv('UCS-4BE', 'UTF-8', $ucs4)
                          : mb_convert_encoding($ucs4, 'UTF-8', 'UCS-4BE');
          return is_string($s) ? $s : false;
      }

      # no extension available: encode codepoint by codepoint
      return implode('', array_map(array(__CLASS__, 'chr'), $a));
  }
  /**
   * Converts a UTF-8 character to a Unicode codepoint.
   * The decoder is chosen by the byte length of $char (1..4 bytes);
   * results are memoised in a static cache.
   *
   * @param   string|null    $char  UTF-8 character
   * @return  int|bool|null  Unicode codepoint
   *                         Returns FALSE (+ E_USER_WARNING) if $char is empty or longer than 4 bytes
   */
  public static function ord($char)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($char)) return $char;

      static $cache = array();
      if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve

      # Square-bracket offsets are used on purpose: the curly-brace form ($char{0})
      # is deprecated since PHP 7.4 and is a parse error since PHP 8.0.
      switch (strlen($char))
      {
          case 1  : return $cache[$char] = ord($char);
          case 2  : return $cache[$char] = (ord($char[1]) & 63) |
                                           ((ord($char[0]) & 31) << 6);
          case 3  : return $cache[$char] = (ord($char[2]) & 63) |
                                           ((ord($char[1]) & 63) << 6) |
                                           ((ord($char[0]) & 15) << 12);
          case 4  : return $cache[$char] = (ord($char[3]) & 63) |
                                           ((ord($char[2]) & 63) << 6) |
                                           ((ord($char[1]) & 63) << 12) |
                                           ((ord($char[0]) & 7)  << 18);
          default :
              trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING);
              return false;
      }
  }
  /**
   * Converts a Unicode codepoint to a UTF-8 character.
   * Codepoints that cannot be encoded as well-formed UTF-8 (negative values,
   * the surrogate range U+D800..U+DFFF, values above U+10FFFF) yield U+FFFD REPLACEMENT CHARACTER.
   * Results are memoised in a static cache.
   *
   * @param   int|digit|null    $cp  Unicode codepoint
   * @return  string|bool|null  UTF-8 character
   *                            Returns FALSE if error occurred
   */
  public static function chr($cp)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($cp)) return $cp;

      static $cache = array();
      if (array_key_exists($cp, $cache)) return $cache[$cp]; #speed improve

      # Negative values would be truncated by the native chr() into a stray high byte,
      # and surrogates have no well-formed UTF-8 encoding: both become U+FFFD.
      if ($cp < 0 || ($cp >= 0xd800 && $cp <= 0xdfff)) return $cache[$cp] = "\xEF\xBF\xBD";

      if ($cp <= 0x7f)     return $cache[$cp] = chr($cp);
      if ($cp <= 0x7ff)    return $cache[$cp] = chr(0xc0 | ($cp >> 6))  .
                                                chr(0x80 | ($cp & 0x3f));
      if ($cp <= 0xffff)   return $cache[$cp] = chr(0xe0 | ($cp >> 12)) .
                                                chr(0x80 | (($cp >> 6) & 0x3f)) .
                                                chr(0x80 | ($cp & 0x3f));
      if ($cp <= 0x10ffff) return $cache[$cp] = chr(0xf0 | ($cp >> 18)) .
                                                chr(0x80 | (($cp >> 12) & 0x3f)) .
                                                chr(0x80 | (($cp >> 6) & 0x3f)) .
                                                chr(0x80 | ($cp & 0x3f));
      #U+FFFD REPLACEMENT CHARACTER
      return $cache[$cp] = "\xEF\xBF\xBD";
  }
  /**
   * Splits a UTF-8 string into chunks of $length characters joined with $glue.
   * The chunks are joined with implode(), so $glue appears only between chunks.
   * A $length below 1 defaults to 76, an empty $glue defaults to "\r\n".
   *
   * @param   string|null       $s
   * @param   int|digit|null    $length
   * @param   string|null       $glue
   * @return  string|bool|null  Returns FALSE if error occurred
   */
  public static function chunk_split($s, $length = null, $glue = null)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      $size = intval($length);
      if ($size < 1) $size = 76;

      $separator = strval($glue);
      if ($separator === '') $separator = "\r\n";

      $chunks = self::str_split($s, $size);
      return is_array($chunks) ? implode($separator, $chunks) : false;
  }
  /**
   * Changes the case of all string keys in an array (integer keys are kept as they are).
   * A value that is not an array is returned unchanged.
   *
   * @param   array|null       $a
   * @param   int              $mode  {CASE_LOWER|CASE_UPPER}
   * @return  array|bool|null  Returns FALSE if error occurred
   */
  public static function array_change_key_case($a, $mode)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (! is_array($a)) return $a;

      $result = array();
      foreach ($a as $key => $value)
      {
          $new_key = $key;
          if (is_string($key))
          {
              $new_key = self::convert_case($key, $mode);
              if ($new_key === false) return false;
          }
          $result[$new_key] = $value;
      }
      return $result;
  }
  /**
   * Converts the letter case of data in UTF-8 encoding.
   * Arrays are walked recursively; only the values are converted, the keys stay untouched.
   * To convert keys only, use self::array_change_key_case().
   *
   * @see     self::array_change_key_case()
   * @link    http://www.unicode.org/charts/PDF/U0400.pdf
   * @link    http://ru.wikipedia.org/wiki/ISO_639-1
   * @param   array|scalar|null  $data  Data of arbitrary structure
   * @param   int                $mode  {CASE_LOWER|CASE_UPPER}
   * @param   bool               $is_ascii_optimization  for speed improve
   * @return  scalar|bool|null   Returns FALSE if error occurred
   */
  public static function convert_case($data, $mode, $is_ascii_optimization = true)
  {
      if (! ReflectionTypeHint::isValid()) return false;

      if (is_array($data))
      {
          # the optimisation flag is forwarded, so nested values honour the caller's choice
          foreach ($data as $k => $v) $data[$k] = self::convert_case($v, $mode, $is_ascii_optimization);
          return $data;
      }

      # non-strings and empty strings ('' and '0') have nothing to convert
      if (! is_string($data) || ! $data) return $data;

      if ($mode === CASE_UPPER)
      {
          if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve!
          #since PHP-5.3.x strtr() is 2-3 times faster than mb_strtoupper()
          return strtr($data, array_flip(self::$convert_case_table));
      }
      if ($mode === CASE_LOWER)
      {
          if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve!
          #since PHP-5.3.x strtr() is 2-3 times faster than mb_strtolower()
          return strtr($data, self::$convert_case_table);
      }

      trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING);
      return $data;
  }
  /**
   * Converts data to lower case; shortcut for self::convert_case($data, CASE_LOWER).
   *
   * @param   array|scalar|null  $data
   * @return  scalar|bool|null   Returns FALSE if error occurred
   */
  public static function lowercase($data)
  {
      return ReflectionTypeHint::isValid()
          ? self::convert_case($data, CASE_LOWER)
          : false;
  }
  /**
   * Converts data to upper case; shortcut for self::convert_case($data, CASE_UPPER).
   *
   * @param   array|scalar|null  $data
   * @return  scalar|null        Returns FALSE if error occurred
   */
  public static function uppercase($data)
  {
      return ReflectionTypeHint::isValid()
          ? self::convert_case($data, CASE_UPPER)
          : false;
  }
  /**
   * Converts data to lower case (named after the native strtolower());
   * shortcut for self::convert_case($data, CASE_LOWER).
   *
   * @param   array|scalar|null  $data
   * @return  scalar|bool|null   Returns FALSE if error occurred
   */
  public static function strtolower($data)
  {
      return ReflectionTypeHint::isValid()
          ? self::convert_case($data, CASE_LOWER)
          : false;
  }
  /**
   * Converts data to upper case (named after the native strtoupper());
   * shortcut for self::convert_case($data, CASE_UPPER).
   *
   * @param   array|scalar|null  $data
   * @return  scalar|null        Returns FALSE if error occurred
   */
  public static function strtoupper($data)
  {
      return ReflectionTypeHint::isValid()
          ? self::convert_case($data, CASE_UPPER)
          : false;
  }
  /**
   * Convert all HTML entities to native UTF-8 characters.
   * Named entities are replaced through the class-level entity table;
   * all decimal and hexadecimal numeric entities are converted to UTF-8 as well.
   *
   * Example: '&quot;' or '&#34;' or '&#x22;' will be converted to '"'.
   *
   * @link    http://www.htmlhelp.com/reference/html40/entities/
   * @link    http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
   * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
   * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
   * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
   *
   * @param   scalar|null  $s
   * @param   bool         $is_special_chars   Also decode the special html entities? (&lt; &gt; &amp; &quot;)
   * @return  scalar|null  Returns FALSE if error occurred
   */
  public static function html_entity_decode($s, $is_special_chars = false)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (! is_string($s)) return $s;

      #speed improve: the shortest entity is 4 bytes long (&#d; &xx;)
      #and every entity needs a ';' somewhere after the first '&'
      if (strlen($s) < 4) return $s;
      $pos = strpos($s, '&');
      if ($pos === false || strpos($s, ';', $pos) === false) return $s;

      $table = self::$html_entity_table;
      if ($is_special_chars) $table += self::$html_special_chars_table;

      #replace named entities
      $s = strtr($s, $table);

      #replace numeric dec and hex entities
      if (strpos($s, '&#') !== false) #speed improve
      {
          $class = __CLASS__;
          $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table);
          $s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX',
              function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars)
              {
                  # $m[1] of a hex entity still carries the leading "x": strip it before hexdec()
                  $is_hex    = isset($m[2]) && $m[2] === 'x';
                  $codepoint = $is_hex ? hexdec(substr($m[1], 1)) : intval($m[1]);

                  # When special chars must stay encoded, a numeric reference to one of them is
                  # turned into its named entity. Only codepoints of the single-byte range can be
                  # such a char; a wider codepoint must not be reduced to its low byte.
                  if (! $is_special_chars && $codepoint <= 0x7f)
                  {
                      $char = chr($codepoint);
                      if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char];
                  }
                  return $class::chr($codepoint);
              }, $s);
      }
      return $s;
  }
  /**
   * Convert special UTF-8 characters to HTML entities.
   * Characters are replaced with named entities through the (flipped) class-level entity table.
   *
   * @link    http://www.htmlhelp.com/reference/html40/entities/
   * @link    http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
   * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
   * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
   * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
   *
   * @param   scalar|null  $s
   * @param   bool         $is_special_chars_only   Process only the special html entities? (&lt; &gt; &amp; &quot;)
   * @return  scalar|null  Returns FALSE if error occurred
   */
  public static function html_entity_encode($s, $is_special_chars_only = false)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (! is_string($s)) return $s;

      # special characters only: the native function does the whole job
      if ($is_special_chars_only) return htmlspecialchars($s);

      # replace UTF-8 chars with named entities (since PHP-5.3.x strtr() is the fastest way)
      $char_to_entity = array_flip(self::$html_entity_table);
      return strtr($s, $char_to_entity);
  }
  /**
   * Make regular expression for case insensitive match
   * Example (non ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"
   * Example (only ASCII): "123_test" => "(?i:123_test)"
   *
   * @param  string            $s
   * @param  string|null       $delimiter  If the optional delimiter is specified, it will also be escaped.
   *                                       This is useful for escaping the delimiter that is required by the PCRE functions.
   *                                       The / is the most commonly used delimiter.
   * @return string|bool|null  Returns FALSE if error occurred
   */
  public static function preg_quote_case_insensitive($s, $delimiter = null)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      if (self::is_ascii($s)) return '(?i:' . preg_quote($s, $delimiter) . ')'; #speed improve

      # sibling methods are reached through self::, like everywhere else in the class
      $s_lc = self::lowercase($s);        if ($s_lc === false) return false;
      $s_uc = self::uppercase($s);        if ($s_uc === false) return false;
      $chars_lc = self::str_split($s_lc); if ($chars_lc === false) return false;
      $chars_uc = self::str_split($s_uc); if ($chars_uc === false) return false;

      $s_re = '';
      foreach ($chars_lc as $i => $lc)
      {
          # if both variants do not line up character by character, fall back to the lower-case char
          $uc = isset($chars_uc[$i]) ? $chars_uc[$i] : $lc;

          if ($lc === $uc)
          {
              #caseless character (digit, punctuation, ...)
              $s_re .= preg_quote($lc, $delimiter);
          }
          elseif (self::is_ascii($lc))
          {
              #ASCII letter => character class
              $s_re .= '[' . preg_quote($lc . $uc, $delimiter) . ']';
          }
          else
          {
              #multibyte letter => alternation
              $s_re .= '(' . preg_quote($lc, $delimiter) . '|'
                           . preg_quote($uc, $delimiter) . ')';
          }
      }
      return $s_re;
  }
  /**
   * Call preg_match_all() and convert byte offsets into character offsets for PREG_OFFSET_CAPTURE flag.
   * This is regardless of whether you use /u modifier.
   * Offsets of subpatterns that did not take part in the match (reported as -1) are left untouched.
   *
   * @link  http://bolknote.ru/2010/09/08/~2704
   *
   * @param   string           $pattern
   * @param   string|null      $subject
   * @param   array            $matches
   * @param   int              $flags
   * @param   int              $char_offset  Offset in characters (converted to a byte offset internally)
   * @return  int|bool|null    The value returned by the native preg_match_all();
   *                           Returns FALSE if error occurred, NULL if $subject is NULL
   */
  public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($subject)) return null;

      $byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset;

      $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset);
      if ($return === false) return false;

      if ($flags & PREG_OFFSET_CAPTURE)
      {
          foreach ($matches as &$match)
          {
              foreach ($match as &$a)
              {
                  # Byte offset 0 is character offset 0 already, and -1 marks an unmatched
                  # subpattern: feeding -1 to substr() would cut the subject from its end
                  # and produce a bogus positive offset.
                  if ($a[1] > 0) $a[1] = self::strlen(substr($subject, 0, $a[1]));
              }
              unset($a);
          }
          unset($match);
      }
      return $return;
  }
  /**
   * Alias for self::str_limit(): all arguments are passed through unchanged
   * and $is_cutted is forwarded by reference.
   */
  public static function truncate($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20)
  {
      $limited = self::str_limit($s, $maxlength, $continue, $is_cutted, $tail_min_length);
      return $limited;
  }
  /**
   * Cuts a UTF-8 text down to the given length.
   * The last word is kept whole instead of being broken in the middle.
   * Html entities and "\r\n" line breaks are each counted as one character.
   *
   * @param   string|null       $s                Text in UTF-8 encoding
   * @param   int|null|digit    $maxlength        Length limit, in characters (256 when empty)
   * @param   string            $continue         Closing string appended to the text when it gets cut
   *                                              ("\xe2\x80\xa6" = "&hellip;" by default)
   * @param   bool|null         &$is_cutted       Has the text been cut?
   * @param   int|digit         $tail_min_length  If the "tail" left over after the cut is shorter than
   *                                              $tail_min_length, the text is returned unchanged
   * @return  string|bool|null  Returns FALSE if error occurred
   */
  public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      $is_cutted = false;
      if ($continue === null) $continue = "\xe2\x80\xa6";
      if (! $maxlength) $maxlength = 256;

      #speed improve block
      #{{{
      # the byte length is an upper bound of the character length
      if (strlen($s) <= $maxlength) return $s;
      # collapse every line break and html entity into one placeholder char and measure again
      $s2 = str_replace("\r\n", '?', $s);
      $s2 = preg_replace('/&(?> [a-zA-Z][a-zA-Z\d]+
                              | \#(?> \d{1,4}
                                    | x[\da-fA-F]{2,4}
                                  )
                            );  # html сущности (&lt; &gt; &amp; &quot;)
                         /sxSX', '?', $s2);
      if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) return $s;
      #}}}

      # split the text into countable units: line break, html entity or single character
      $r = preg_match_all('/(?> \r\n   # переносы строк
                              | &(?> [a-zA-Z][a-zA-Z\d]+
                                   | \#(?> \d{1,4}
                                         | x[\da-fA-F]{2,4}
                                       )
                                 );  # html сущности (&lt; &gt; &amp; &quot;)
                              | .
                            )
                          /sxuSX', $s, $m);
      if ($r === false) return false;
      if (count($m[0]) <= $maxlength) return $s;

      $left = implode('', array_slice($m[0], 0, $maxlength));
      # Trailing ASCII chars are trimmed, except letters, digits, opening paired chars [a-zA-Z\d\(\{\[]
      # and a few others. ";" must not be trimmed, because it closes html entities (&xxx;).
      $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F");
      if (strlen($left) !== strlen($left2)) $return = $left2 . $continue;
      else
      {
          # the cut fell inside a word: append the rest of that word
          $right = implode('', array_slice($m[0], $maxlength));
          # With the /u modifier "\xhh" means the code point U+00hh, not a byte,
          # so the closing quotes are written as \x{...} code points.
          preg_match('/^(?> [\d\)\]\}\-\.:]+  #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80!
                          | \p{L}+            #буквы
                          | \x{201d}          #закрывающие кавычки
                          | \x{2019}          #закрывающие кавычки
                          | \x{201c}          #закрывающие кавычки
                          | \x{bb}            #закрывающие кавычки
                        )+
                      /suxSX', $right, $tail);
          $right = isset($tail[0]) ? rtrim($tail[0], '.-') : '';
          $return = $left . $right;
          if (strlen($return) !== strlen($s)) $return .= $continue;
      }

      # cutting off a very short tail is not worth it
      if (self::strlen($s) - self::strlen($return) < $tail_min_length) return $s;

      $is_cutted = true;
      return $return;
  }
  /**
   * Splits a UTF-8 string into an array of chunks, $length characters each (1 by default).
   *
   * @param   string|null      $s
   * @param   int|null|digit   $length
   * @return  array|bool|null  Returns FALSE if error occurred (including $length < 1 and broken UTF-8)
   */
  public static function str_split($s, $length = null)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      $size = ($length === null) ? 1 : intval($length);
      if ($size < 1) return false;

      # Characters are matched one by one: there are limits in regexp for {min,max}!
      if (preg_match_all('~.~suSX', $s, $m) === false) return false;
      if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false;

      if ($size === 1) return $m[0];

      # glue every group of $size characters back together
      $chunks = array();
      foreach (array_chunk($m[0], $size) as $group) $chunks[] = implode('', $group);
      return $chunks;
  }
  /**
   * Returns the length of a UTF-8 string in characters.
   *
   * @param   string|null    $s
   * @return  int|bool|null  Returns FALSE if error occurred
   */
  public static function strlen($s)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # since PHP-5.3.x mb_strlen() is faster than strlen(utf8_decode())
      #
      # Fallback: utf8_decode() converts characters that are not in ISO-8859-1 to '?',
      # which, for the purpose of counting, is quite alright.
      return function_exists('mb_strlen')
          ? mb_strlen($s, 'utf-8')
          : strlen(utf8_decode($s));
  }
  /**
   * Finds the character position of the first occurrence of a substring in a UTF-8 string.
   *
   * @param   string|null    $s       The entire string
   * @param   string|int     $needle  The searched substring
   * @param   int|null       $offset  The optional offset parameter specifies the position from which the search should be performed
   * @return  int|bool|null  Returns the numeric position of the first occurrence of needle in haystack.
   *                         If needle is not found, will return FALSE.
   */
  public static function strpos($s, $needle, $offset = null)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      if ($offset === null || $offset < 0) $offset = 0;
      if (function_exists('mb_strpos')) return mb_strpos($s, $needle, $offset, 'utf-8');

      # Fallback without mbstring (iconv_strpos() is slower than self::strlen(substr())).
      # The character offset is used as the first byte offset, then byte matches are
      # walked one after another until one lies at or after the requested character offset.
      $byte_pos = $offset;
      while (true)
      {
          $byte_pos = strpos($s, $needle, $byte_pos);
          if ($byte_pos === false) return false;

          $char_pos = self::strlen(substr($s, 0, $byte_pos));
          $byte_pos++; #the next search starts right after this match

          if (! ($char_pos < $offset)) return $char_pos;
      }
  }
  /**
   * Find position of first occurrence of a case-insensitive string.
   *
   * @param   string|null    $s       The entire string
   * @param   string|int     $needle  The searched substring
   * @param   int|null       $offset  The optional offset parameter specifies the position from which the search should be performed
   * @return  int|bool|null  Returns the numeric position of the first occurrence of needle in haystack.
   *                         If needle is not found, will return FALSE.
   */
  public static function stripos($s, $needle, $offset = null)
  {
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      if ($offset === null || $offset < 0) $offset = 0;
      if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8');

      #optimization block (speed improve)
      #{{{
      $is_ascii_haystack = (bool)self::is_ascii($s);
      $is_ascii_needle   = (bool)self::is_ascii($needle);
      # pure ASCII on both sides: byte and character offsets coincide
      if ($is_ascii_haystack && $is_ascii_needle) return stripos($s, $needle, $offset);
      # a non-ASCII needle is treated as absent from an ASCII haystack
      if ($is_ascii_haystack) return false;
      # NB: an ASCII needle CAN occur inside a non-ASCII haystack, so that case takes the generic path
      #}}}

      $s = self::convert_case($s, CASE_LOWER, false);
      if ($s === false) return false;
      $needle = self::convert_case($needle, CASE_LOWER, false);
      if ($needle === false) return false;
      return self::strpos($s, $needle, $offset);
  }
  3354. /**
  3355. * Implementation strrev() function for UTF-8 encoding string
  3356. *
  3357. * @param string|null $s
  3358. * @return string|bool|null Returns FALSE if error occurred
  3359. */
  public static function strrev($s)
  {
      # Reverses a UTF-8 string character by character.
      # Returns NULL for NULL input and FALSE when the string cannot be split.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # TODO (from the original author): benchmark an alternative that converts
      # to UTF-32, reverses the 4-byte groups and converts back. That variant was
      # permanently disabled, so only the split/reverse/join path is kept.
      $chars = self::str_split($s);
      if (! is_array($chars))
      {
          return false;
      }
      $reversed = array_reverse($chars);
      return implode('', $reversed);
  }
  3374. /**
  3375. * Implementation substr() function for UTF-8 encoding string.
  3376. *
  3377. * @link http://www.w3.org/International/questions/qa-forms-utf-8.html
  3378. * @param string|null $s
  3379. * @param int|digit $offset
  3380. * @param int|null|digit $length
  3381. * @return string|bool|null Returns FALSE if error occurred
  3382. */
  public static function substr($s, $offset, $length = null)
  {
      # UTF-8 aware substr(): $offset and $length are counted in characters.
      # Returns NULL for NULL input and FALSE when the string cannot be split.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # Prefer a native extension; mb_substr() goes first because since
      # PHP-5.3.x it is faster than iconv_substr().
      foreach (array('mb_substr', 'iconv_substr') as $native)
      {
          if (! function_exists($native)) continue;
          if ($length === null) $length = self::strlen($s);
          return $native($s, $offset, $length, 'utf-8');
      }

      # Pure PHP fallback. The character array of the most recently processed
      # string is kept between calls, so repeated slicing of the same string
      # splits it only once.
      static $cached_string = null;
      static $cached_chars  = null;
      if ($cached_string !== $s)
      {
          $cached_string = $s;
          $cached_chars  = self::str_split($s);
      }
      if (! is_array($cached_chars)) return false;

      $slice = ($length !== null)
          ? array_slice($cached_chars, $offset, $length)
          : array_slice($cached_chars, $offset);
      return implode('', $slice);
  }
  3406. /**
  3407. * Implementation substr_replace() function for UTF-8 encoding string.
  3408. *
  3409. * @param string|null $s
  3410. * @param string|int $replacement
  3411. * @param int|digit $start
  3412. * @param int|null $length
  3413. * @return string|bool|null Returns FALSE if error occurred
  3414. */
  public static function substr_replace($s, $replacement, $start, $length = null)
  {
      # UTF-8 aware substr_replace(): replaces $length characters of $s, starting
      # at character $start, with $replacement.
      # Returns NULL for NULL input and FALSE when the string cannot be split.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      $chars = self::str_split($s);
      if (! is_array($chars)) return false;

      # An omitted length means "replace up to the end of the string", as in the
      # native substr_replace(). It has to be spelled out here: before PHP 8 an
      # explicit NULL passed to array_splice() is treated as 0, which would
      # insert the replacement without removing anything.
      if ($length === null) $length = count($chars);

      array_splice($chars, $start, $length, $replacement);
      return implode('', $chars);
  }
  3423. /**
  3424. * Implementation ucfirst() function for UTF-8 encoding string.
  3425. * Преобразует первый символ строки в кодировке UTF-8 в верхний регистр.
  3426. *
  3427. * @param string|null $s
  3428. * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр?
  3429. * @return string|bool|null Returns FALSE if error occurred
  3430. */
  public static function ucfirst($s, $is_other_to_lowercase = true)
  {
      # Upper-cases the first character of a UTF-8 string; the rest of the
      # string is lower-cased when $is_other_to_lowercase is TRUE.
      if (! ReflectionTypeHint::isValid()) return false;

      # NULL, the empty string and non-string values are returned unchanged.
      if (! is_string($s) || $s === '') return $s;

      # Split into the first character and the remainder; the match fails
      # (=> FALSE) when the input is not valid UTF-8.
      $parts = array();
      if (! preg_match('/^(.)(.*)$/suSX', $s, $parts))
      {
          return false;
      }

      $head = self::uppercase($parts[1]);
      if ($is_other_to_lowercase)
      {
          $tail = self::lowercase($parts[2]);
      }
      else
      {
          $tail = $parts[2];
      }
      return $head . $tail;
  }
  3439. /**
  3440. * Implementation ucwords() function for UTF-8 encoding string.
  3441. * Преобразует в верхний регистр первый символ каждого слова в строке в кодировке UTF-8,
  3442. * остальные символы каждого слова преобразуются в нижний регистр.
  3443. *
  3444. * @param string|null $s
  3445. * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр?
  3446. * @param string $spaces_re
  3447. * @return string|bool|null Returns FALSE if error occurred
  3448. */
  public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\pZ\s]+)~suSX') #\pXps is POSIX space: property Z or tab, NL, VT, FF, CR
  {
      # Upper-cases the first character of every word of a UTF-8 string; the
      # other characters of each word are lower-cased when
      # $is_other_to_lowercase is TRUE. Words are delimited by $spaces_re.
      # Returns NULL for NULL input and FALSE on error.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # The delimiters are captured too, so joining the pieces restores the
      # original spacing exactly.
      $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
      if (! is_array($words)) return false; #invalid pattern or invalid UTF-8

      foreach ($words as $k => $word)
      {
          # Pass the caller's flag through unchanged (it used to be overwritten
          # with TRUE here, which made the parameter useless).
          $words[$k] = self::ucfirst($word, $is_other_to_lowercase);
          if ($words[$k] === false) return false;
      }
      return implode('', $words);
  }
  3461. /**
  3462. * Decodes a string in the format %uXXXX or %u{XXXXXX} in the UTF-8 string.
  3463. *
  3464. * Используется для декодирования данных типа "%u0442%u0435%u0441%u0442",
  3465. * закодированных устаревшей функцией javascript://encode().
  3466. * Рекомендуется использовать функцию javascript://encodeURIComponent().
  3467. *
  3468. * NOTICE
  3469. * Устаревший формат %uXXXX позволяет использовать юникод только из диапазона UCS-2, т.е. от U+0 до U+FFFF
  3470. *
  3471. * @param scalar|array|null $data
  3472. * @param bool $is_rawurlencode
  3473. * @return scalar|array|null Returns FALSE if error occurred
  3474. */
  public static function unescape($data, $is_rawurlencode = false)
  {
      # Decodes %uXXXX and %u{XXXXXX} sequences into UTF-8 characters.
      # Arrays are processed recursively (keys and values); other scalars and
      # NULL are returned unchanged; objects and resources yield FALSE.
      # With $is_rawurlencode the decoded characters are rawurlencode()'d again,
      # so that the result can be fed to parse_str().
      if (! ReflectionTypeHint::isValid()) return false;

      if (is_array($data))
      {
          $result = array();
          # Iterate by value: the elements are only read, so no reference is needed
          # (and no dangling reference is left behind).
          foreach ($data as $key => $value)
          {
              $key = self::unescape($key, $is_rawurlencode);
              if ($key === false) return false;
              $result[$key] = self::unescape($value, $is_rawurlencode);
              # FALSE is an error marker unless the source value itself was a boolean.
              if ($result[$key] === false && ! is_bool($value)) return false;
          }
          return $result;
      }

      if (is_string($data))
      {
          if (strpos($data, '%u') === false) return $data; #use strpos() for speed improving
          $decoded = preg_replace_callback('/%u(    [\da-fA-F]{4}+          #%uXXXX only UCS-2
                                                 | \{ [\da-fA-F]{1,6}+ \}  #%u{XXXXXX} extended form for all UNICODE charts
                                               )
                                           /sxSX',
              function (array $m) use ($is_rawurlencode)
              {
                  $codepoint = hexdec(trim($m[1], '{}'));
                  $char = self::chr($codepoint);
                  return $is_rawurlencode ? rawurlencode($char) : $char;
              },
              $data);
          # preg_replace_callback() reports a PCRE failure as NULL; callers are
          # documented to receive FALSE on error.
          return is_string($decoded) ? $decoded : false;
      }

      if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean
      return false; #object or resource
  }
  3508. /**
  3509. * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST
  3510. * decoded values ​​in the format %uXXXX and %u{XXXXXX}, encoded,
  3511. * for example, through an outdated javascript function escape().
  3512. * Standard PHP5 cannot do it.
  3513. * 2) If in the HTTP_COOKIE there are parameters with the same name,
  3514. * takes the last value, not the first, as in the QUERY_STRING.
  3515. * 3) Creates an array of $_POST for non-standard Content-Type, for example, "Content-Type: application/octet-stream".
  3516. * Standard PHP5 creates an array for "Content-Type: application/x-www-form-urlencoded" and "Content-Type: multipart/form-data".
  3517. *
  3518. * Сессии, куки и независимая авторизация на поддоменах.
  3519. *
  3520. * ПРИМЕР 1
  3521. * У рабочего сайта http://domain.com появились поддомены.
  3522. * Для кроссдоменной авторизации через механизм сессий имя хоста для COOKIE было изменено с "domain.com" на ".domain.com"
  3523. * В результате авторизация не работает.
  3524. * Помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична.
  3525. * Проблема в следующем: если в HTTP_COOKIE есть параметры с одинаковым именем, то берётся последнее значение,
  3526. * а не первое, как в QUERY_STRING.
  3527. * Более подробное описание:
  3528. * PHP не правильно (?) обрабатывает заголовок HTTP_COOKIE, если там встречаются параметры с одинаковым именем, но разными значениями.
  3529. * Пример запроса HTTP-заголовка клиентом: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp"
  3530. * В этом случае сервер берёт первое значение, а не последнее.
  3531. * Хотя если в QUERY_STRING есть такая ситуация, всегда берётся последний параметр.
  3532. * В HTTP_COOKIE два параметра с одинаковым именем могут появиться, если отправить клиенту следующие HTTP-заголовки:
  3533. * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (только domain.com)
  3534. * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com и все его поддомены)
  3535. * Решение: поменять имя сессии.
  3536. *
  3537. * ПРИМЕР 2
  3538. * Есть рабочие сайты: http://domain.com (основной), http://admin.domain.com (админка),
  3539. * http://sub1.domain.com (подпроект 1), http://sub2.domain.com, (подпроект 2).
  3540. * Так же имеется сервер разработки http://dev.domain.com, на котором м. б. свои поддомены.
  3541. * Требуется сделать независимую кросс-доменную авторизацию для http://*.domain.com и http://*.dev.domain.com.
  3542. * Для сохранения статуса авторизации будем использовать сессию, имя и значение которой пишется в COOKIE.
  3543. * Т. к. домены http://*.dev.domain.com имеют пересечение с доменами http://*.domain.com,
  3544. * для независимой авторизации нужно использовать разные имена сессий.
  3545. * Пример HTTP заголовков ответа сервера:
  3546. * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com и все его поддомены)
  3547. * "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com и все его поддомены)
  3548. *
  3549. * @link http://tools.ietf.org/html/rfc2965 RFC 2965 - HTTP State Management Mechanism
  3550. * @return void
  3551. */
  public static function unescape_request()
  {
      # Re-parses the raw query string, POST body and Cookie header into
      # $_GET, $_POST and $_COOKIE (decoding %uXXXX sequences on the way) and
      # rebuilds $_REQUEST when at least one of them was re-parsed.

      #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"!
      $is_post = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST';
      if (! $is_post)
      {
          $raw_post = null;
      }
      elseif (isset($GLOBALS['HTTP_RAW_POST_DATA']))
      {
          $raw_post = $GLOBALS['HTTP_RAW_POST_DATA'];
      }
      else
      {
          $raw_post = @file_get_contents('php://input');
      }
      if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $raw_post;

      # Superglobal name => raw request data it has to be parsed from.
      $sources = array(
          '_GET'    => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null,
          '_POST'   => $raw_post,
          '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null,
      );

      $is_fixed = false;
      foreach ($sources as $global_name => $raw)
      {
          if (! is_string($raw)) continue;

          if ($global_name === '_COOKIE')
          {
              # Turn "a=1; b=2" into "a=1&b=2" and drop PHP's own $_COOKIE, so that
              # the header is parsed here with the same rules as QUERY_STRING.
              $raw = preg_replace('/; *+/sSX', '&', $raw);
              unset($_COOKIE);
          }

          $has_escapes = strpos($raw, '%u') !== false;
          # Without %u sequences an array that already exists is left alone.
          if (! $has_escapes && array_key_exists($global_name, $GLOBALS)) continue;

          $query = $has_escapes ? self::unescape($raw, true) : $raw;
          parse_str($query, $GLOBALS[$global_name]);
          $is_fixed = true;
      }

      if (! $is_fixed) return;

      # With the "+" operator the left operand wins on duplicate keys:
      # cookies take precedence over POST, POST over GET.
      $cookie_vars = isset($_COOKIE) ? $_COOKIE : array();
      $post_vars   = isset($_POST)   ? $_POST   : array();
      $get_vars    = isset($_GET)    ? $_GET    : array();
      $_REQUEST = $cookie_vars + $post_vars + $get_vars;
  }
  3587. /**
  3588. * Calculates the height of the edit text in <textarea> html tag by value and width.
  3589. *
  3590. * В большинстве случаев будет корректно работать для моноширинных шрифтов.
  3591. * Т.к. браузер переносит последнее слово, которое не умещается на строке,
  3592. * на следующую строку, высота м.б. меньше ожидаемой.
  3593. * Этот алгоритм явл. простым (и быстрым) и не отслеживает переносы слов.
  3594. *
  3595. * @param string|null $s Текст
  3596. * @param int|digit $cols Ширина области редактирования (колонок)
  3597. * @param int|digit $min_rows Минимальное кол-во строк
  3598. * @param int|digit $max_rows Максимальное кол-во строк
  3599. * @return int|bool|null Number of rows (lines)
  3600. */
  public static function textarea_rows($s, $cols, $min_rows = 3, $max_rows = 32)
  {
      # Estimates how many rows a <textarea> of $cols columns needs to show $s,
      # clamped to the range $min_rows .. $max_rows. Word wrapping is not
      # simulated: every line is simply cut into chunks of $cols characters.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;
      if (strlen($s) == 0) return $min_rows; #speed improve

      #utf8_decode() converts characters that are not in ISO-8859-1 to '?',
      #so every character becomes exactly one byte and strlen() counts characters
      $lines = preg_split('/\r\n|[\r\n]/sSX', utf8_decode($s));

      $total = 0;
      foreach ($lines as $line)
      {
          $line_width = strlen($line) + 1;
          $total += ceil($line_width / $cols);
          # Stop early once the upper bound is exceeded.
          if ($total > $max_rows)
          {
              return $max_rows;
          }
      }

      if ($total < $min_rows)
      {
          return $min_rows;
      }
      return $total;
  }
  3615. /**
  3616. * @param string|null $s
  3617. * @param string|null $charlist
  3618. * @return string|bool|null
  3619. */
  public static function ltrim($s, $charlist = null)
  {
      # UTF-8 aware ltrim(): strips the characters listed in $charlist (default:
      # the native whitespace set) from the beginning of $s.
      # Returns NULL for NULL input.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # No list given (an empty list is treated the same way, as before).
      if ($charlist === null || $charlist === '') return ltrim($s);

      # An ASCII-only list can go to the native function, but it must actually be
      # passed on (it used to be dropped, so only whitespace was ever stripped).
      # Native semantics apply, including "a..z" ranges.
      if (self::is_ascii($charlist)) return ltrim($s, $charlist);

      return preg_replace('~^[' . self::_preg_quote_class($charlist, '~') . ']+~suSX', '', $s);
  }
  3627. /**
  3628. * @param string|null $s
  3629. * @param string|null $charlist
  3630. * @return string|bool|null
  3631. */
  public static function rtrim($s, $charlist = null)
  {
      # UTF-8 aware rtrim(): strips the characters listed in $charlist (default:
      # the native whitespace set) from the end of $s.
      # Returns NULL for NULL input.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # No list given (an empty list is treated the same way, as before).
      if ($charlist === null || $charlist === '') return rtrim($s);

      # An ASCII-only list can go to the native function, but it must actually be
      # passed on (it used to be dropped, so only whitespace was ever stripped).
      # Native semantics apply, including "a..z" ranges.
      if (self::is_ascii($charlist)) return rtrim($s, $charlist);

      return preg_replace('~[' . self::_preg_quote_class($charlist, '~') . ']+$~suSX', '', $s);
  }
  3639. /**
  3640. * @param scalar|null $s
  3641. * @param string|null $charlist
  3642. * @return scalar|null
  3643. */
  public static function trim($s, $charlist = null)
  {
      # UTF-8 aware trim(): strips the characters listed in $charlist (default:
      # the native whitespace set) from both ends of $s.
      # Returns NULL for NULL input.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      # No list given (an empty list is treated the same way, as before).
      if ($charlist === null || $charlist === '') return trim($s);

      # An ASCII-only list can go to the native function, but it must actually be
      # passed on (it used to be dropped, so only whitespace was ever stripped).
      # Native semantics apply, including "a..z" ranges.
      if (self::is_ascii($charlist)) return trim($s, $charlist);

      $charlist_re = self::_preg_quote_class($charlist, '~');
      $s = preg_replace('~^[' . $charlist_re . ']+~suSX', '', $s);
      return preg_replace('~[' . $charlist_re . ']+$~suSX', '', $s);
  }
  private static function _preg_quote_class($charlist, $delimiter = null)
  {
      # Escapes a list of characters so that it can be placed between "[" and "]"
      # of a PCRE character class, optionally escaping the pattern delimiter too.
      #return preg_quote($charlist, $delimiter); #DEPRECATED
      $quote_table = array(
          '\\' => '\\\\',
          '-'  => '\-',
          ']'  => '\]',
          '^'  => '\^', #a leading "^" would otherwise negate the whole class
          '['  => '\[', #"[:", "[." and "[=" would otherwise start a POSIX class
      );
      if (is_string($delimiter)) $quote_table[$delimiter] = '\\' . $delimiter;
      return strtr($charlist, $quote_table);
  }
  3664. /**
  3665. * @param string|null $s
  3666. * @param int|digit $length
  3667. * @param string $pad_str
  3668. * @param int $type STR_PAD_LEFT, STR_PAD_RIGHT or STR_PAD_BOTH
  3669. * @return string|bool|null
  3670. */
  public static function str_pad($s, $length, $pad_str = ' ', $type = STR_PAD_RIGHT)
  {
      # UTF-8 aware str_pad(): pads $s with $pad_str up to $length characters.
      # Returns NULL for NULL input, $s itself when it is already long enough,
      # and FALSE (with a warning) for an unknown $type or an empty $pad_str.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;

      $input_len = self::strlen($s);
      if ($length <= $input_len) return $s;
      $pad_len = $length - $input_len;

      # How many characters go on each side. STR_PAD_BOTH gives the extra
      # character (odd amount) to the right side, like the native function.
      if ($type == STR_PAD_RIGHT)
      {
          $pad_amount_left  = 0;
          $pad_amount_right = $pad_len;
      }
      elseif ($type == STR_PAD_LEFT)
      {
          $pad_amount_left  = $pad_len;
          $pad_amount_right = 0;
      }
      elseif ($type == STR_PAD_BOTH)
      {
          $pad_amount_left  = intval(floor($pad_len / 2));
          $pad_amount_right = $pad_len - $pad_amount_left;
      }
      else
      {
          trigger_error('Parameter 4 should be a constant of STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH!', E_USER_WARNING);
          return false;
      }

      # An empty pad string cannot fill anything and would divide by zero below.
      $pad_str_len = self::strlen($pad_str);
      if (! $pad_str_len)
      {
          trigger_error('Parameter 3 should not be an empty string!', E_USER_WARNING);
          return false;
      }

      $padding_left = '';
      if ($pad_amount_left > 0)
      {
          # Repeat the pad string often enough, then cut to the exact amount.
          $repeat_times = intval(ceil($pad_amount_left / $pad_str_len));
          $padding_left = self::substr(str_repeat($pad_str, $repeat_times), 0, $pad_amount_left);
          if ($padding_left === false) return false;
      }

      $padding_right = '';
      if ($pad_amount_right > 0)
      {
          $repeat_times = intval(ceil($pad_amount_right / $pad_str_len));
          $padding_right = self::substr(str_repeat($pad_str, $repeat_times), 0, $pad_amount_right);
          if ($padding_right === false) return false;
      }

      return $padding_left . $s . $padding_right;
  }
  3703. /**
  3704. * @param string $str
  3705. * @param string $mask
  3706. * @param int|null $start
  3707. * @param int|null $length
  3708. * @return int|bool
  3709. */
  public static function strspn($str, $mask, $start = null, $length = null)
  {
      # UTF-8 aware strspn(): returns the length, in characters, of the initial
      # segment of $str (optionally restricted to the $start/$length slice)
      # that consists only of characters listed in $mask.
      if (! ReflectionTypeHint::isValid()) return false;
      #if (self::is_ascii($str) && self::is_ascii($mask)) return strspn($str, $mask, $start, $length);
      if ($start !== null || $length !== null)
      {
          $str = self::substr($str, $start, $length);
          if ($str === false) return false;
      }

      # An empty mask would produce the invalid pattern "[]+"; nothing can match it.
      if ((string) $mask === '') return 0;

      if (preg_match('~^[' . preg_quote($mask, '~') . ']+~uSX', $str, $m))
      {
          # The computed length has to be returned (the value used to be
          # discarded, so the function always answered 0).
          return self::strlen($m[0]);
      }
      return 0;
  }
  3718. /**
  3719. * Recode the text files in a specified folder in the UTF-8
  3720. * In the processing skipped binary files, files encoded in UTF-8, files that could not convert.
  3721. * So method works reliably enough.
  3722. *
  3723. *
  3724. * @param string $dir Директория для сканирования
  3725. * @param string|null $files_re Регул. выражение для шаблона имён файлов,
  3726. * например: '~\.(?:txt|sql|php|pl|py|sh|tpl|xml|xsl|html|xhtml|phtml|htm|js|json|css|conf|cfg|ini|htaccess)$~sSX'
  3727. * @param bool $is_recursive Обрабатывать вложенные папки и файлы?
  3728. * @param string $charset Исходная кодировка
  3729. * @param string|null $dirs_ignore_re Регул. выражение для исключения папок из обработки
  3730. * например: '~^(?:cache|images?|photos?|fonts?|img|ico|\.svn|\.hg|\.cvs)$~siSX'
  3731. * @param bool $is_echo Печать имён обработанных файлов и статус обработки в выходной поток?
  3732. * @param bool $is_simulate Сымитировать работу без реальной перезаписи файлов?
  3733. * @return int|bool Возвращает кол-во перекодированных файлов
  3734. * Returns FALSE if error occurred
  3735. */
  public static function convert_files_from(
      $dir,
      $files_re = null,
      $is_recursive = true,
      $charset = 'cp1251',
      $dirs_ignore_re = null,
      $is_echo = false,
      $is_simulate = false)
  {
      # Recodes the text files found in $dir from $charset to UTF-8 and returns
      # the number of converted files, or FALSE on a read/write error.
      # Files that already are UTF-8 and files that cannot be converted (treated
      # as binary) are skipped. The directory handle is closed on every exit path.
      if (! ReflectionTypeHint::isValid()) return false;
      $dh = opendir($dir);
      if (! is_resource($dh)) return false;

      $counter = 0; #becomes FALSE as soon as an error stops the scan
      while (($name = readdir($dh)) !== false)
      {
          if ($name == '.' || $name == '..') continue;
          $file = $dir . '/' . $name;

          if (is_file($file))
          {
              if (is_string($files_re) && ! preg_match($files_re, $name)) continue;
              if ($is_echo) echo $file;

              $s = @file_get_contents($file);
              if (! is_string($s))
              {
                  if ($is_echo) echo ' Error to reading' . PHP_EOL;
                  $counter = false;
                  break;
              }
              if (self::is_utf8($s))
              {
                  if ($is_echo) echo ' UTF-8' . PHP_EOL;
                  continue;
              }

              $s = self::_convert($s, $charset, 'UTF-8');
              #ignore conversion failures: they are expected for binary files
              if (! is_string($s) || ! self::is_utf8($s))
              {
                  if ($is_echo) echo ' Binary' . PHP_EOL;
                  continue;
              }

              # Keep the charset declared inside the document in sync with its new encoding.
              $ext = strtolower(pathinfo($name, PATHINFO_EXTENSION));
              if ($ext === 'htm' || $ext === 'html' || $ext === 'xhtml' || $ext === 'phtml' || $ext === 'tpl')
              {
                  $s = preg_replace('~(<meta .+? content="text/html; [\x00-\x20]+ charset=) #1
                                      [-a-zA-Z\d]+
                                      (" [^>]* >) #2
                                     ~sixSX', '$1utf-8$2', $s);
              }
              if ($ext === 'xml' || $ext === 'xsl' || $ext === 'tpl')
              {
                  $s = preg_replace('~(<\?xml .+? encoding=") #1
                                      [-a-zA-Z\d]+
                                      (" .*? \?>) #2
                                     ~sixSX', '$1utf-8$2', $s);
              }

              if (! $is_simulate)
              {
                  $bytes = @file_put_contents($file, $s);
                  if ($bytes === false)
                  {
                      if ($is_echo) echo ' Error to writing' . PHP_EOL;
                      $counter = false;
                      break;
                  }
              }
              if ($is_echo) echo ' ' . $charset . ' -> UTF-8' . PHP_EOL;
              $counter++;
          }
          elseif ($is_recursive && is_dir($file))
          {
              if (! is_string($dirs_ignore_re) || ! preg_match($dirs_ignore_re, $name))
              {
                  $c = self::convert_files_from($file, $files_re, $is_recursive, $charset, $dirs_ignore_re, $is_echo, $is_simulate);
                  if ($c === false)
                  {
                      $counter = false;
                      break;
                  }
                  $counter += $c;
              }
          }
      }

      # Single exit point: the handle is released on success and on error alike.
      closedir($dh);
      return $counter;
  }
  3815. /**
  3816. *
  3817. * @param int|string $low
  3818. * @param int|string $high
  3819. * @param int $step
  3820. * @return array|bool Returns FALSE if error occurred
  3821. */
  public static function range($low, $high, $step = 1)
  {
      # UTF-8 aware range(): builds the list of characters from $low to $high by
      # walking their code points. Integer bounds are delegated to the native
      # range(). Returns FALSE when a bound cannot be converted to a code point.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_int($low) || is_int($high)) return range($low, $high, $step); #speed improve

      $low_cp  = self::ord($low);
      $high_cp = self::ord($high);
      if ($low_cp === false || $high_cp === false) return false;

      $codepoints = range($low_cp, $high_cp, $step);
      # __CLASS__ names the same class as "self" did, but the array('self', ...)
      # callable form is deprecated as of PHP 8.2.
      return array_map(array(__CLASS__, 'chr'), $codepoints);
  }
  3832. /**
  3833. *
  3834. * @param string|null $s
  3835. * @param string|array $from
  3836. * @param string|null $to
  3837. * @return string|bool|null Returns FALSE if error occurred
  3838. */
  public static function strtr($s, $from, $to = null)
  {
      # UTF-8 aware strtr(). With an array in $from it behaves like the native
      # two-argument form; otherwise the N-th character of $from is replaced by
      # the N-th character of $to.
      # Returns NULL for NULL input and FALSE on error, which includes a missing
      # $to and lists with a different number of characters.
      if (! ReflectionTypeHint::isValid()) return false;
      if (is_null($s)) return $s;
      if (is_array($from)) return strtr($s, $from); #speed improve

      $keys   = self::str_split($from);
      $values = self::str_split($to);
      # Validate before array_combine(): on PHP 8 it throws for non-array input
      # or mismatched sizes instead of returning FALSE.
      if (! is_array($keys) || ! is_array($values)) return false;
      if (count($keys) !== count($values)) return false;

      $table = array_combine($keys, $values);
      if (! is_array($table)) return false;
      return strtr($s, $table);
  }
  public static function tests()
  {
      # Built-in self test: every entry below is a PHP expression, given as a
      # string, that is handed to assert(). Returns TRUE when all of them hold
      # and FALSE at the first failing one.
      assert_options(ASSERT_ACTIVE, true);
      assert_options(ASSERT_BAIL, true);
      assert_options(ASSERT_WARNING, true);
      assert_options(ASSERT_QUIET_EVAL, false);

      $assertions = array(
          'self::html_entity_decode("&quot;&amp;&lt;&gt;", true) === "\"&<>"',
          'self::html_entity_decode("&quot;&amp;&lt;&gt;", false) === "&quot;&amp;&lt;&gt;"',
          'self::html_entity_decode("&amp;amp;", true) === "&amp;"',
          'self::html_entity_decode("&amp;amp;", false) === "&amp;amp;"',
          'self::html_entity_decode("&#034;", true) === "\""',
          'self::html_entity_decode("&#034;", false) === "&quot;"',
          'self::html_entity_decode("&#039;", true) === "\'"',
          'self::html_entity_decode("&#039;", false) === "\'"',
          'self::html_entity_decode("&#x22;", true) === "\""',
          'self::html_entity_decode("&#x22;", false) === "&quot;"',
          'self::array_change_key_case(array("АБВГД" => "АБВГД"), CASE_LOWER) === array("абвгд" => "АБВГД")',
          'self::array_change_key_case(array("абвгд" => "абвгд"), CASE_UPPER) === array("АБВГД" => "абвгд")',
          'self::blocks_check("Яндекс", "Cyrillic") === true',
          'self::blocks_check("Google", "Basic Latin") === true',
          'self::blocks_check("Google & Яндекс", array("Basic Latin", "Cyrillic")) === true',
          'self::blocks_check("Ё-моё, Yandex!", array(array(0x20, 0x7E), #[\x20-\x7E]
                                                      array(0x0410, 0x044F), #[A-Яa-я]
                                                      0x0401, #russian yo (Ё)
                                                      0x0451, #russian ye (ё)
                                                )) === true',
          'self::chunk_split("абвг", 2) === "аб\r\nвг"',
          'self::chunk_split("абвг", 2, "|") === "аб|вг"',
          'self::lowercase("1234-ABCD-АБВГ") === "1234-abcd-абвг"',
          'self::lowercase(array("1234-ABCD-АБВГ" => "1234-ABCD-АБВГ")) === array("1234-ABCD-АБВГ" => "1234-abcd-абвг")',
          'self::uppercase("1234-abcd-абвг") === "1234-ABCD-АБВГ"',
          'self::uppercase(array("1234-abcd-абвг" => "1234-abcd-абвг")) === array("1234-abcd-абвг" => "1234-ABCD-АБВГ")',
          'self::convert_from(self::convert_to("123-ABC-abc-АБВ-абв", $charset = "cp1251"), $charset = "cp1251") === "123-ABC-abc-АБВ-абв"',
          'self::diactrical_remove("вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние") === "вдох\xc2\xadно\xc2\xadве\xc2\xadние"',
          'self::diactrical_remove("вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние", array("\xc2\xad")) === "вдохновение"',
          'self::diactrical_remove("вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние", array("\xc2\xad"), true, $restore_table) === "вдохновение"',
          'self::diactrical_restore("вдохновение", $restore_table) === "вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние"',
          'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', true)) === true',
          'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', false)) === true',
          'self::is_ascii(file_get_contents(' . var_export(__FILE__, true) . ')) === false',
          #range() uses ord() and chr()
          'self::range("A", "D") === array("A", "B", "C", "D")',
          'self::range("а", "г") === array("а", "б", "в", "г")',
          'self::range(1, 3) === array(1, 2, 3)',
          '"↔" === self::chr(self::ord("↔"))',
          '"123-ABC-abc-АБВ-абв" === self::from_unicode(self::to_unicode("123-ABC-abc-АБВ-абв"))',
          'self::strpos("123-ABC-abc-абв-АБВ-где", "АБВ") === 16',
          'self::stripos("123-ABC-abc-абд-АБВ-где", "абв") === 16',
          'self::strpos("123-ABC-abc", "АБВ") === false',
          'self::strpos("123-АБВ-абв", "abc") === false',
          'self::preg_quote_case_insensitive("123_слово_test") === "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"',
          'self::preg_quote_case_insensitive("123_test") === "(?i:123_test)"',
      );

      # Stop at the first expression that does not hold.
      foreach ($assertions as $expression)
      {
          $is_ok = assert($expression);
          if (! $is_ok)
          {
              return false;
          }
      }
      return true;
  }
  3914. }