PageRenderTime 51ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/system/helper/utf8.php

https://bitbucket.org/jjasko/opencart_serbian
PHP | 835 lines | 661 code | 107 blank | 67 comment | 77 complexity | d8075142c5fadce0ed254e90227e69bd MD5 | raw file
  1. <?php
  /**
   * Returns the number of characters (not bytes) in a UTF-8 string.
   *
   * utf8_decode() converts the string to a single-byte encoding, so every
   * UTF-8 sequence ends up as exactly one byte and the byte length of the
   * decoded string equals the character count.
   *
   * @param string $string UTF-8 encoded text
   * @return int character count
   */
  function utf8_strlen($string) {
      $singleByte = utf8_decode($string);

      return strlen($singleByte);
  }
  /**
   * UTF-8 aware counterpart of strpos(): finds the first occurrence of
   * $needle in $string and returns its position counted in characters.
   *
   * A negative $offset is counted from the end of the string (and clamped to
   * the start), matching the convention utf8_substr() already applies. It is
   * normalised before the search so that the returned position is always
   * relative to the beginning of $string.
   *
   * @param string   $string haystack (UTF-8)
   * @param string   $needle text to look for (UTF-8)
   * @param int|null $offset character position to start searching from
   * @return int|false character position of the match, or false if not found
   */
  function utf8_strpos($string, $needle, $offset = NULL) {
      if (is_null($offset)) {
          // Split once on the needle: everything before the first match is
          // the prefix whose character length is the match position.
          $parts = explode($needle, $string, 2);

          return count($parts) > 1 ? utf8_strlen($parts[0]) : false;
      }

      if (!is_int($offset)) {
          trigger_error('utf8_strpos: Offset must be an integer', E_USER_ERROR);
          return false;
      }

      if ($offset < 0) {
          // Turn "N characters from the end" into an absolute position;
          // adding the raw negative offset to the result would be wrong.
          $offset = max(0, utf8_strlen($string) + $offset);
      }

      $position = utf8_strpos(utf8_substr($string, $offset), $needle);

      return $position === false ? false : $position + $offset;
  }
  /**
   * UTF-8 aware counterpart of strrpos(): finds the last occurrence of
   * $needle in $string and returns its position counted in characters.
   *
   * With a positive $offset only the part of the string starting at that
   * character is searched. With a negative $offset the behaviour follows the
   * native strrpos(): the match must start at or before the character that
   * lies |$offset| characters from the end. In both cases the returned
   * position is relative to the beginning of $string.
   *
   * @param string   $string haystack (UTF-8)
   * @param string   $needle text to look for (UTF-8)
   * @param int|null $offset character offset limiting the search
   * @return int|false character position of the match, or false if not found
   */
  function utf8_strrpos($string, $needle, $offset = NULL) {
      if (is_null($offset)) {
          $pieces = explode($needle, $string);
          if (count($pieces) > 1) {
              // Dropping the last piece and re-joining yields everything that
              // precedes the final occurrence of the needle.
              array_pop($pieces);

              return utf8_strlen(implode($needle, $pieces));
          }

          return false;
      }

      if (!is_int($offset)) {
          trigger_error('utf8_strrpos expects parameter 3 to be long', E_USER_WARNING);
          return false;
      }

      if ($offset < 0) {
          // Keep only the leading part in which a match is still allowed to
          // start; the needle itself may extend past the cut-off point, hence
          // the added needle length. Positions in a prefix need no shifting.
          $limit = utf8_strlen($string) + $offset + utf8_strlen($needle);
          if ($limit <= 0) {
              return false;
          }

          return utf8_strrpos(utf8_substr($string, 0, $limit), $needle);
      }

      $position = utf8_strrpos(utf8_substr($string, $offset), $needle);

      return $position === false ? false : $position + $offset;
  }
  /**
   * UTF-8 aware counterpart of substr(), implemented with a PCRE pattern in
   * UTF-8 mode so that offsets and lengths are counted in characters.
   *
   * The pattern is assembled from two parts: a non-capturing prefix that
   * skips $offset characters, and a capturing group describing the wanted
   * slice. Repetition counts are split into blocks of 65535 because the code
   * never emits a single quantifier larger than that.
   *
   * @param string   $string UTF-8 encoded text
   * @param int      $offset start position in characters; negative counts from the end
   * @param int|null $length number of characters; negative stops that many
   *                         characters before the end; null means "to the end"
   * @return string the requested slice, or '' when nothing matches
   */
  function utf8_substr($string, $offset, $length = null) {
      $string = (string)$string;
      $offset = (int)$offset;
      if ($length !== null) {
          $length = (int)$length;
      }

      // Trivial cases that never need a pattern.
      if ($length === 0) {
          return '';
      }
      if ($offset < 0 && $length < 0 && $length < $offset) {
          return '';
      }

      // Character count of the input; computed lazily, at most once.
      $charCount = null;

      // Convert a negative offset into an absolute one, clamped to the start.
      if ($offset < 0) {
          $charCount = strlen(utf8_decode($string));
          $offset = max(0, $charCount + $offset);
      }

      // Prefix: anchor at the start and skip $offset characters.
      if ($offset > 0) {
          $skipBlocks = (int)($offset / 65535);
          $skipRest = $offset % 65535;
          $skipChunk = $skipBlocks ? '(?:.{65535}){' . $skipBlocks . '}' : '';
          $offsetPattern = '^(?:' . $skipChunk . '.{' . $skipRest . '})';
      } else {
          $offsetPattern = '^';
      }

      // Capture: describe the slice that follows the prefix.
      $lengthPattern = '';
      if ($length === null) {
          // No length given: take the rest of the string.
          $lengthPattern = '(.*)$';
      } else {
          if ($charCount === null) {
              $charCount = strlen(utf8_decode($string));
          }
          if ($offset > $charCount) {
              return '';
          }

          if ($length > 0) {
              // Never ask for more characters than remain after the offset.
              $length = min($charCount - $offset, $length);
              $takeBlocks = (int)($length / 65535);
              $takeRest = $length % 65535;
              $takeChunk = $takeBlocks ? '(?:.{65535}){' . $takeBlocks . '}' : '';
              $lengthPattern = '(' . $takeChunk . '.{' . $takeRest . '})';
          } else if ($length < 0) {
              if ($length < ($offset - $charCount)) {
                  return '';
              }
              // Capture everything except a tail of -$length characters
              // anchored at the end of the string.
              $dropBlocks = (int)((-$length) / 65535);
              $dropRest = (-$length) % 65535;
              $dropChunk = $dropBlocks ? '(?:.{65535}){' . $dropBlocks . '}' : '';
              $lengthPattern = '(.*)(?:' . $dropChunk . '.{' . $dropRest . '})$';
          }
      }

      if (!preg_match('#' . $offsetPattern . $lengthPattern . '#us', $string, $match)) {
          return '';
      }

      return $match[1];
  }
  /**
   * Lower-cases a UTF-8 string using a fixed upper-to-lower code point table
   * covering Latin, Greek and Cyrillic letters.
   *
   * The string is decoded to code points, each code point found in the table
   * is replaced, and the result is encoded back to UTF-8. Code points that
   * are not in the table pass through unchanged.
   *
   * Only a real decoding failure (utf8_to_unicode() returning false) yields
   * false; an input that decodes to no code points at all, such as the empty
   * string, yields an empty string.
   *
   * @param string $string UTF-8 encoded text
   * @return string|false lower-cased text, or false when $string is not valid UTF-8
   */
  function utf8_strtolower($string) {
      static $UTF8_UPPER_TO_LOWER = NULL;

      if (is_null($UTF8_UPPER_TO_LOWER)) {
          // Built once per request; keys are upper-case code points.
          $UTF8_UPPER_TO_LOWER = array(
              // Basic Latin
              0x0041 => 0x0061, 0x0042 => 0x0062, 0x0043 => 0x0063, 0x0044 => 0x0064,
              0x0045 => 0x0065, 0x0046 => 0x0066, 0x0047 => 0x0067, 0x0048 => 0x0068,
              0x0049 => 0x0069, 0x004A => 0x006A, 0x004B => 0x006B, 0x004C => 0x006C,
              0x004D => 0x006D, 0x004E => 0x006E, 0x004F => 0x006F, 0x0050 => 0x0070,
              0x0051 => 0x0071, 0x0052 => 0x0072, 0x0053 => 0x0073, 0x0054 => 0x0074,
              0x0055 => 0x0075, 0x0056 => 0x0076, 0x0057 => 0x0077, 0x0058 => 0x0078,
              0x0059 => 0x0079, 0x005A => 0x007A,
              // Latin-1 Supplement
              0x00C0 => 0x00E0, 0x00C1 => 0x00E1, 0x00C2 => 0x00E2, 0x00C3 => 0x00E3,
              0x00C4 => 0x00E4, 0x00C5 => 0x00E5, 0x00C6 => 0x00E6, 0x00C7 => 0x00E7,
              0x00C8 => 0x00E8, 0x00C9 => 0x00E9, 0x00CA => 0x00EA, 0x00CB => 0x00EB,
              0x00CC => 0x00EC, 0x00CD => 0x00ED, 0x00CE => 0x00EE, 0x00CF => 0x00EF,
              0x00D0 => 0x00F0, 0x00D1 => 0x00F1, 0x00D2 => 0x00F2, 0x00D3 => 0x00F3,
              0x00D4 => 0x00F4, 0x00D5 => 0x00F5, 0x00D6 => 0x00F6, 0x00D8 => 0x00F8,
              0x00D9 => 0x00F9, 0x00DA => 0x00FA, 0x00DB => 0x00FB, 0x00DC => 0x00FC,
              0x00DD => 0x00FD, 0x00DE => 0x00FE,
              // Latin Extended-A and Latin Extended-B
              0x0100 => 0x0101, 0x0102 => 0x0103, 0x0104 => 0x0105, 0x0106 => 0x0107,
              0x0108 => 0x0109, 0x010A => 0x010B, 0x010C => 0x010D, 0x010E => 0x010F,
              0x0110 => 0x0111, 0x0112 => 0x0113, 0x0116 => 0x0117, 0x0118 => 0x0119,
              0x011A => 0x011B, 0x011C => 0x011D, 0x011E => 0x011F, 0x0120 => 0x0121,
              0x0122 => 0x0123, 0x0124 => 0x0125, 0x0126 => 0x0127, 0x0128 => 0x0129,
              0x012A => 0x012B, 0x012E => 0x012F, 0x0134 => 0x0135, 0x0136 => 0x0137,
              0x0139 => 0x013A, 0x013B => 0x013C, 0x013D => 0x013E, 0x0141 => 0x0142,
              0x0143 => 0x0144, 0x0145 => 0x0146, 0x0147 => 0x0148, 0x014A => 0x014B,
              0x014C => 0x014D, 0x0150 => 0x0151, 0x0154 => 0x0155, 0x0156 => 0x0157,
              0x0158 => 0x0159, 0x015A => 0x015B, 0x015C => 0x015D, 0x015E => 0x015F,
              0x0160 => 0x0161, 0x0162 => 0x0163, 0x0164 => 0x0165, 0x0166 => 0x0167,
              0x0168 => 0x0169, 0x016A => 0x016B, 0x016C => 0x016D, 0x016E => 0x016F,
              0x0170 => 0x0171, 0x0172 => 0x0173, 0x0174 => 0x0175, 0x0176 => 0x0177,
              0x0178 => 0x00FF, 0x0179 => 0x017A, 0x017B => 0x017C, 0x017D => 0x017E,
              0x01A0 => 0x01A1, 0x01AF => 0x01B0, 0x0218 => 0x0219, 0x021A => 0x021B,
              // Greek
              0x0386 => 0x03AC, 0x0388 => 0x03AD, 0x0389 => 0x03AE, 0x038A => 0x03AF,
              0x038C => 0x03CC, 0x038E => 0x03CD, 0x038F => 0x03CE,
              0x0391 => 0x03B1, 0x0392 => 0x03B2, 0x0393 => 0x03B3, 0x0394 => 0x03B4,
              0x0395 => 0x03B5, 0x0396 => 0x03B6, 0x0397 => 0x03B7, 0x0398 => 0x03B8,
              0x0399 => 0x03B9, 0x039A => 0x03BA, 0x039B => 0x03BB, 0x039C => 0x03BC,
              0x039D => 0x03BD, 0x039E => 0x03BE, 0x039F => 0x03BF, 0x03A0 => 0x03C0,
              0x03A1 => 0x03C1, 0x03A3 => 0x03C3, 0x03A4 => 0x03C4, 0x03A5 => 0x03C5,
              0x03A6 => 0x03C6, 0x03A7 => 0x03C7, 0x03A8 => 0x03C8, 0x03A9 => 0x03C9,
              0x03AA => 0x03CA, 0x03AB => 0x03CB,
              // Cyrillic
              0x0401 => 0x0451, 0x0402 => 0x0452, 0x0403 => 0x0453, 0x0404 => 0x0454,
              0x0405 => 0x0455, 0x0406 => 0x0456, 0x0407 => 0x0457, 0x0408 => 0x0458,
              0x0409 => 0x0459, 0x040A => 0x045A, 0x040B => 0x045B, 0x040C => 0x045C,
              0x040E => 0x045E, 0x040F => 0x045F,
              0x0410 => 0x0430, 0x0411 => 0x0431, 0x0412 => 0x0432, 0x0413 => 0x0433,
              0x0414 => 0x0434, 0x0415 => 0x0435, 0x0416 => 0x0436, 0x0417 => 0x0437,
              0x0418 => 0x0438, 0x0419 => 0x0439, 0x041A => 0x043A, 0x041B => 0x043B,
              0x041C => 0x043C, 0x041D => 0x043D, 0x041E => 0x043E, 0x041F => 0x043F,
              0x0420 => 0x0440, 0x0421 => 0x0441, 0x0422 => 0x0442, 0x0423 => 0x0443,
              0x0424 => 0x0444, 0x0425 => 0x0445, 0x0426 => 0x0446, 0x0427 => 0x0447,
              0x0428 => 0x0448, 0x0429 => 0x0449, 0x042A => 0x044A, 0x042B => 0x044B,
              0x042C => 0x044C, 0x042D => 0x044D, 0x042E => 0x044E, 0x042F => 0x044F,
              0x0490 => 0x0491,
              // Latin Extended Additional
              0x1E02 => 0x1E03, 0x1E0A => 0x1E0B, 0x1E1E => 0x1E1F, 0x1E40 => 0x1E41,
              0x1E56 => 0x1E57, 0x1E60 => 0x1E61, 0x1E6A => 0x1E6B, 0x1E80 => 0x1E81,
              0x1E82 => 0x1E83, 0x1E84 => 0x1E85, 0x1EF2 => 0x1EF3
          );
      }

      $unicode = utf8_to_unicode($string);
      // Compare strictly: an empty array is a valid result (empty input),
      // only false signals that decoding failed.
      if ($unicode === false) {
          return false;
      }

      foreach ($unicode as $index => $codepoint) {
          if (isset($UTF8_UPPER_TO_LOWER[$codepoint])) {
              $unicode[$index] = $UTF8_UPPER_TO_LOWER[$codepoint];
          }
      }

      return utf8_from_unicode($unicode);
  }
  /**
   * Upper-cases a UTF-8 string using a fixed lower-to-upper code point table
   * covering Latin, Greek and Cyrillic letters.
   *
   * The string is decoded to code points, each code point found in the table
   * is replaced, and the result is encoded back to UTF-8. Code points that
   * are not in the table pass through unchanged.
   *
   * Only a real decoding failure (utf8_to_unicode() returning false) yields
   * false; an input that decodes to no code points at all, such as the empty
   * string, yields an empty string.
   *
   * @param string $string UTF-8 encoded text
   * @return string|false upper-cased text, or false when $string is not valid UTF-8
   */
  function utf8_strtoupper($string) {
      static $UTF8_LOWER_TO_UPPER = NULL;

      if (is_null($UTF8_LOWER_TO_UPPER)) {
          // Built once per request; keys are lower-case code points.
          $UTF8_LOWER_TO_UPPER = array(
              // Basic Latin
              0x0061 => 0x0041, 0x0062 => 0x0042, 0x0063 => 0x0043, 0x0064 => 0x0044,
              0x0065 => 0x0045, 0x0066 => 0x0046, 0x0067 => 0x0047, 0x0068 => 0x0048,
              0x0069 => 0x0049, 0x006A => 0x004A, 0x006B => 0x004B, 0x006C => 0x004C,
              0x006D => 0x004D, 0x006E => 0x004E, 0x006F => 0x004F, 0x0070 => 0x0050,
              0x0071 => 0x0051, 0x0072 => 0x0052, 0x0073 => 0x0053, 0x0074 => 0x0054,
              0x0075 => 0x0055, 0x0076 => 0x0056, 0x0077 => 0x0057, 0x0078 => 0x0058,
              0x0079 => 0x0059, 0x007A => 0x005A,
              // Latin-1 Supplement
              0x00E0 => 0x00C0, 0x00E1 => 0x00C1, 0x00E2 => 0x00C2, 0x00E3 => 0x00C3,
              0x00E4 => 0x00C4, 0x00E5 => 0x00C5, 0x00E6 => 0x00C6, 0x00E7 => 0x00C7,
              0x00E8 => 0x00C8, 0x00E9 => 0x00C9, 0x00EA => 0x00CA, 0x00EB => 0x00CB,
              0x00EC => 0x00CC, 0x00ED => 0x00CD, 0x00EE => 0x00CE, 0x00EF => 0x00CF,
              0x00F0 => 0x00D0, 0x00F1 => 0x00D1, 0x00F2 => 0x00D2, 0x00F3 => 0x00D3,
              0x00F4 => 0x00D4, 0x00F5 => 0x00D5, 0x00F6 => 0x00D6, 0x00F8 => 0x00D8,
              0x00F9 => 0x00D9, 0x00FA => 0x00DA, 0x00FB => 0x00DB, 0x00FC => 0x00DC,
              0x00FD => 0x00DD, 0x00FE => 0x00DE, 0x00FF => 0x0178,
              // Latin Extended-A and Latin Extended-B
              0x0101 => 0x0100, 0x0103 => 0x0102, 0x0105 => 0x0104, 0x0107 => 0x0106,
              0x0109 => 0x0108, 0x010B => 0x010A, 0x010D => 0x010C, 0x010F => 0x010E,
              0x0111 => 0x0110, 0x0113 => 0x0112, 0x0117 => 0x0116, 0x0119 => 0x0118,
              0x011B => 0x011A, 0x011D => 0x011C, 0x011F => 0x011E, 0x0121 => 0x0120,
              0x0123 => 0x0122, 0x0125 => 0x0124, 0x0127 => 0x0126, 0x0129 => 0x0128,
              0x012B => 0x012A, 0x012F => 0x012E, 0x0135 => 0x0134, 0x0137 => 0x0136,
              0x013A => 0x0139, 0x013C => 0x013B, 0x013E => 0x013D, 0x0142 => 0x0141,
              0x0144 => 0x0143, 0x0146 => 0x0145, 0x0148 => 0x0147, 0x014B => 0x014A,
              0x014D => 0x014C, 0x0151 => 0x0150, 0x0155 => 0x0154, 0x0157 => 0x0156,
              0x0159 => 0x0158, 0x015B => 0x015A, 0x015D => 0x015C, 0x015F => 0x015E,
              0x0161 => 0x0160, 0x0163 => 0x0162, 0x0165 => 0x0164, 0x0167 => 0x0166,
              0x0169 => 0x0168, 0x016B => 0x016A, 0x016D => 0x016C, 0x016F => 0x016E,
              0x0171 => 0x0170, 0x0173 => 0x0172, 0x0175 => 0x0174, 0x0177 => 0x0176,
              0x017A => 0x0179, 0x017C => 0x017B, 0x017E => 0x017D,
              0x01A1 => 0x01A0, 0x01B0 => 0x01AF, 0x0219 => 0x0218, 0x021B => 0x021A,
              // Greek
              0x03AC => 0x0386, 0x03AD => 0x0388, 0x03AE => 0x0389, 0x03AF => 0x038A,
              0x03B1 => 0x0391, 0x03B2 => 0x0392, 0x03B3 => 0x0393, 0x03B4 => 0x0394,
              0x03B5 => 0x0395, 0x03B6 => 0x0396, 0x03B7 => 0x0397, 0x03B8 => 0x0398,
              0x03B9 => 0x0399, 0x03BA => 0x039A, 0x03BB => 0x039B, 0x03BC => 0x039C,
              0x03BD => 0x039D, 0x03BE => 0x039E, 0x03BF => 0x039F, 0x03C0 => 0x03A0,
              0x03C1 => 0x03A1, 0x03C3 => 0x03A3, 0x03C4 => 0x03A4, 0x03C5 => 0x03A5,
              0x03C6 => 0x03A6, 0x03C7 => 0x03A7, 0x03C8 => 0x03A8, 0x03C9 => 0x03A9,
              0x03CA => 0x03AA, 0x03CB => 0x03AB,
              0x03CC => 0x038C, 0x03CD => 0x038E, 0x03CE => 0x038F,
              // Cyrillic
              0x0430 => 0x0410, 0x0431 => 0x0411, 0x0432 => 0x0412, 0x0433 => 0x0413,
              0x0434 => 0x0414, 0x0435 => 0x0415, 0x0436 => 0x0416, 0x0437 => 0x0417,
              0x0438 => 0x0418, 0x0439 => 0x0419, 0x043A => 0x041A, 0x043B => 0x041B,
              0x043C => 0x041C, 0x043D => 0x041D, 0x043E => 0x041E, 0x043F => 0x041F,
              0x0440 => 0x0420, 0x0441 => 0x0421, 0x0442 => 0x0422, 0x0443 => 0x0423,
              0x0444 => 0x0424, 0x0445 => 0x0425, 0x0446 => 0x0426, 0x0447 => 0x0427,
              0x0448 => 0x0428, 0x0449 => 0x0429, 0x044A => 0x042A, 0x044B => 0x042B,
              0x044C => 0x042C, 0x044D => 0x042D, 0x044E => 0x042E, 0x044F => 0x042F,
              0x0451 => 0x0401, 0x0452 => 0x0402, 0x0453 => 0x0403, 0x0454 => 0x0404,
              0x0455 => 0x0405, 0x0456 => 0x0406, 0x0457 => 0x0407, 0x0458 => 0x0408,
              0x0459 => 0x0409, 0x045A => 0x040A, 0x045B => 0x040B, 0x045C => 0x040C,
              0x045E => 0x040E, 0x045F => 0x040F,
              0x0491 => 0x0490,
              // Latin Extended Additional
              0x1E03 => 0x1E02, 0x1E0B => 0x1E0A, 0x1E1F => 0x1E1E, 0x1E41 => 0x1E40,
              0x1E57 => 0x1E56, 0x1E61 => 0x1E60, 0x1E6B => 0x1E6A, 0x1E81 => 0x1E80,
              0x1E83 => 0x1E82, 0x1E85 => 0x1E84, 0x1EF3 => 0x1EF2
          );
      }

      $unicode = utf8_to_unicode($string);
      // Compare strictly: an empty array is a valid result (empty input),
      // only false signals that decoding failed.
      if ($unicode === false) {
          return false;
      }

      foreach ($unicode as $index => $codepoint) {
          if (isset($UTF8_LOWER_TO_UPPER[$codepoint])) {
              $unicode[$index] = $UTF8_LOWER_TO_UPPER[$codepoint];
          }
      }

      return utf8_from_unicode($unicode);
  }
  /**
   * Decodes a UTF-8 string into an array of Unicode code points.
   *
   * The input is walked byte by byte with a small state machine. Overlong
   * (non-shortest) forms, 5- and 6-byte sequences, surrogates and values
   * above 0x10FFFF are rejected with an E_USER_WARNING. A byte order mark
   * (0xFEFF) is accepted but not added to the output.
   *
   * Bytes are read with square-bracket string offsets; the curly-brace form
   * is not accepted by PHP 8.
   *
   * NOTE(review): a multi-byte sequence cut off by the end of the string is
   * not reported; the partial character is simply left out of the result.
   *
   * @param string $str UTF-8 encoded text
   * @return array|false list of integer code points, or false on invalid input
   */
  function utf8_to_unicode($str) {
      $mState = 0; // number of continuation bytes still expected
      $mUcs4 = 0;  // code point assembled so far
      $mBytes = 1; // total length of the sequence being decoded
      $out = array();
      $len = strlen($str);

      for ($i = 0; $i < $len; $i++) {
          $in = ord($str[$i]);

          if ($mState == 0) {
              // Expecting either an ASCII byte or the lead byte of a
              // multi-byte sequence.
              if (0 == (0x80 & $in)) {
                  // US-ASCII, pass straight through.
                  $out[] = $in;
                  $mBytes = 1;
              } elseif (0xC0 == (0xE0 & $in)) {
                  // Lead byte of a 2-byte sequence.
                  $mUcs4 = ($in & 0x1F) << 6;
                  $mState = 1;
                  $mBytes = 2;
              } elseif (0xE0 == (0xF0 & $in)) {
                  // Lead byte of a 3-byte sequence.
                  $mUcs4 = ($in & 0x0F) << 12;
                  $mState = 2;
                  $mBytes = 3;
              } elseif (0xF0 == (0xF8 & $in)) {
                  // Lead byte of a 4-byte sequence.
                  $mUcs4 = ($in & 0x07) << 18;
                  $mState = 3;
                  $mBytes = 4;
              } elseif (0xF8 == (0xFC & $in)) {
                  // Lead byte of a 5-byte sequence. Always illegal, but the
                  // sequence is consumed to its end and rejected by the
                  // validity check below instead of resynchronising here.
                  $mUcs4 = ($in & 0x03) << 24;
                  $mState = 4;
                  $mBytes = 5;
              } elseif (0xFC == (0xFE & $in)) {
                  // Lead byte of a 6-byte sequence; handled like the 5-byte case.
                  $mUcs4 = ($in & 1) << 30;
                  $mState = 5;
                  $mBytes = 6;
              } else {
                  // Neither ASCII nor a legal lead byte.
                  trigger_error('utf8_to_unicode: Illegal sequence identifier ' . 'in UTF-8 at byte ' . $i, E_USER_WARNING);
                  return FALSE;
              }
          } else {
              // Inside a multi-byte sequence: only continuation bytes are legal.
              if (0x80 == (0xC0 & $in)) {
                  // Merge the six payload bits at their position.
                  $shift = ($mState - 1) * 6;
                  $mUcs4 |= ($in & 0x0000003F) << $shift;

                  if (0 == --$mState) {
                      // Sequence complete: validate the assembled code point.
                      if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                          ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                          ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                          (4 < $mBytes) ||
                          // Surrogates are illegal in UTF-8.
                          (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                          // Beyond the Unicode range.
                          ($mUcs4 > 0x10FFFF)) {
                          trigger_error('utf8_to_unicode: Illegal sequence or codepoint in UTF-8 at byte ' . $i, E_USER_WARNING);
                          return false;
                      }

                      if (0xFEFF != $mUcs4) {
                          // BOM is legal but is not emitted.
                          $out[] = $mUcs4;
                      }

                      // Reset the decoder for the next character.
                      $mState = 0;
                      $mUcs4 = 0;
                      $mBytes = 1;
                  }
              } else {
                  // A non-continuation byte arrived before the sequence ended.
                  trigger_error('utf8_to_unicode: Incomplete multi-octet sequence in UTF-8 at byte ' . $i, E_USER_WARNING);
                  return false;
              }
          }
      }

      return $out;
  }
  /**
   * Encodes an array of Unicode code points as a UTF-8 string.
   *
   * A byte order mark (0xFEFF) is skipped. Surrogates (0xD800-0xDFFF) and
   * values outside 0-0x10FFFF raise an E_USER_WARNING and make the function
   * return false.
   *
   * The result is built by string concatenation rather than output
   * buffering, so an early return on invalid input cannot leave an output
   * buffer open or leak partially encoded bytes into the page output.
   *
   * @param array $arr list of integer code points
   * @return string|false UTF-8 encoded text, or false on an illegal code point
   */
  function utf8_from_unicode($arr) {
      $result = '';

      foreach ($arr as $k => $codepoint) {
          if ($codepoint < 0) {
              // Negative values are not code points at all.
              trigger_error('utf8_from_unicode: Codepoint out of Unicode range ' . 'at index: '.$k.', value: '.$codepoint, E_USER_WARNING);
              return FALSE;
          } elseif ($codepoint <= 0x007f) {
              // ASCII range (including control chars)
              $result .= chr($codepoint);
          } elseif ($codepoint <= 0x07ff) {
              // 2 byte sequence
              $result .= chr(0xc0 | ($codepoint >> 6));
              $result .= chr(0x80 | ($codepoint & 0x003f));
          } elseif ($codepoint == 0xFEFF) {
              // Byte order mark: skipped on purpose
          } elseif ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
              // Surrogates cannot be encoded in UTF-8
              trigger_error('utf8_from_unicode: Illegal surrogate at index: ' . $k . ', value: ' . $codepoint, E_USER_WARNING);
              return false;
          } elseif ($codepoint <= 0xffff) {
              // 3 byte sequence
              $result .= chr(0xe0 | ($codepoint >> 12));
              $result .= chr(0x80 | (($codepoint >> 6) & 0x003f));
              $result .= chr(0x80 | ($codepoint & 0x003f));
          } elseif ($codepoint <= 0x10ffff) {
              // 4 byte sequence
              $result .= chr(0xf0 | ($codepoint >> 18));
              $result .= chr(0x80 | (($codepoint >> 12) & 0x3f));
              $result .= chr(0x80 | (($codepoint >> 6) & 0x3f));
              $result .= chr(0x80 | ($codepoint & 0x3f));
          } else {
              // Beyond the Unicode range
              trigger_error('utf8_from_unicode: Codepoint out of Unicode range ' . 'at index: '.$k.', value: '.$codepoint, E_USER_WARNING);
              return FALSE;
          }
      }

      return $result;
  }
  728. ?>