PageRenderTime 36ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/phputf8/utils/bad.php

https://github.com/dianaprajescu/joomla-framework
PHP | 406 lines | 195 code | 51 blank | 160 comment | 52 complexity | 6f1bc7f377a4639bc72010017afd32f2 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1
  1. <?php
  2. /**
  3. * Tools for locating / replacing bad bytes in UTF-8 strings
  4. * The Original Code is Mozilla Communicator client code.
  5. * The Initial Developer of the Original Code is
  6. * Netscape Communications Corporation.
  7. * Portions created by the Initial Developer are Copyright (C) 1998
  8. * the Initial Developer. All Rights Reserved.
  9. * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  10. * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  11. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  12. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  13. * @see http://hsivonen.iki.fi/php-utf8/
  14. * @package utf8
  15. * @see utf8_is_valid
  16. */
  17. //--------------------------------------------------------------------
  /**
   * Locates the first bad byte in a UTF-8 string and returns its
   * byte index in the string.
   *
   * The string is consumed one well-formed UTF-8 sequence at a time using a
   * PCRE pattern adapted from the W3 FAQ "Multilingual Forms" (modified to
   * accept the full ASCII range including control chars). Any byte that
   * cannot start a well-formed sequence falls through to the last
   * alternative of the pattern and is reported as bad.
   *
   * @see http://www.w3.org/International/questions/qa-forms-utf-8
   * @param string $str string to inspect
   * @return mixed integer byte index or FALSE if no bad found
   * @package utf8
   */
  function utf8_bad_find($str) {
      $UTF8_BAD =
          '([\x00-\x7F]'.                        // ASCII (including control chars)
          '|[\xC2-\xDF][\x80-\xBF]'.             // non-overlong 2-byte
          '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // excluding overlongs
          '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // straight 3-byte
          '|\xED[\x80-\x9F][\x80-\xBF]'.         // excluding surrogates
          '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // planes 1-3
          '|[\xF1-\xF3][\x80-\xBF]{3}'.          // planes 4-15
          '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // plane 16
          '|(.{1}))';                            // invalid byte

      $len = strlen($str);
      $pos = 0;

      // Walk the string by offset instead of re-copying the remainder with
      // substr() after every match. The A modifier anchors each match at $pos;
      // every byte matches some alternative, so the walk never skips a byte.
      while ($pos < $len && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $pos)) {
          // Group 2 only participates when the "invalid byte" alternative
          // matched; preg_match() omits trailing groups that did not match.
          if (isset($matches[2])) {
              return $pos;
          }
          $pos += strlen($matches[0]);
      }

      return FALSE;
  }
  52. //--------------------------------------------------------------------
  /**
   * Locates all bad bytes in a UTF-8 string and returns a list of their
   * byte indexes in the string.
   *
   * The string is consumed one well-formed UTF-8 sequence at a time using a
   * PCRE pattern adapted from the W3 FAQ "Multilingual Forms" (modified to
   * accept the full ASCII range including control chars). Each byte that
   * cannot start a well-formed sequence is recorded individually.
   *
   * @see http://www.w3.org/International/questions/qa-forms-utf-8
   * @param string $str string to inspect
   * @return mixed array of integers or FALSE if no bad found
   * @package utf8
   */
  function utf8_bad_findall($str) {
      $UTF8_BAD =
          '([\x00-\x7F]'.                        // ASCII (including control chars)
          '|[\xC2-\xDF][\x80-\xBF]'.             // non-overlong 2-byte
          '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // excluding overlongs
          '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // straight 3-byte
          '|\xED[\x80-\x9F][\x80-\xBF]'.         // excluding surrogates
          '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // planes 1-3
          '|[\xF1-\xF3][\x80-\xBF]{3}'.          // planes 4-15
          '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // plane 16
          '|(.{1}))';                            // invalid byte

      $len = strlen($str);
      $pos = 0;
      $badList = array();

      // Match at a moving offset (anchored with the A modifier) rather than
      // copying the unprocessed tail with substr() on every iteration.
      while ($pos < $len && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $pos)) {
          // Group 2 is only present when the "invalid byte" alternative matched.
          if (isset($matches[2])) {
              $badList[] = $pos;
          }
          $pos += strlen($matches[0]);
      }

      if (count($badList) > 0) {
          return $badList;
      }

      return FALSE;
  }
  90. //--------------------------------------------------------------------
  /**
   * Strips out any bad bytes from a UTF-8 string and returns the rest.
   *
   * The string is consumed one well-formed UTF-8 sequence at a time using a
   * PCRE pattern adapted from the W3 FAQ "Multilingual Forms" (modified to
   * accept the full ASCII range including control chars). Well-formed
   * sequences are copied to the result; bytes that cannot start a
   * well-formed sequence are dropped.
   *
   * @see http://www.w3.org/International/questions/qa-forms-utf-8
   * @param string $str string to clean
   * @return string input with every bad byte removed
   * @package utf8
   */
  function utf8_bad_strip($str) {
      $UTF8_BAD =
          '([\x00-\x7F]'.                        // ASCII (including control chars)
          '|[\xC2-\xDF][\x80-\xBF]'.             // non-overlong 2-byte
          '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // excluding overlongs
          '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // straight 3-byte
          '|\xED[\x80-\x9F][\x80-\xBF]'.         // excluding surrogates
          '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // planes 1-3
          '|[\xF1-\xF3][\x80-\xBF]{3}'.          // planes 4-15
          '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // plane 16
          '|(.{1}))';                            // invalid byte

      $len = strlen($str);
      $pos = 0;

      // Build the result by plain concatenation: no dependency on the global
      // output-buffer stack, so this also works where ob_start() is not
      // permitted (e.g. inside an output handler).
      $result = '';

      // Anchored (A modifier) matching at a moving offset avoids copying the
      // unprocessed tail with substr() on every iteration.
      while ($pos < $len && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $pos)) {
          // Group 2 is only present when the "invalid byte" alternative matched.
          if (!isset($matches[2])) {
              $result .= $matches[0];
          }
          $pos += strlen($matches[0]);
      }

      return $result;
  }
  123. //--------------------------------------------------------------------
  /**
   * Replaces bad bytes with an alternative character. An ASCII character
   * is recommended as the replacement.
   *
   * The string is consumed one well-formed UTF-8 sequence at a time using a
   * PCRE pattern adapted from the W3 FAQ "Multilingual Forms" (modified to
   * accept the full ASCII range including control chars). Well-formed
   * sequences are copied unchanged; each byte that cannot start a
   * well-formed sequence is substituted with $replace.
   *
   * @see http://www.w3.org/International/questions/qa-forms-utf-8
   * @param string $str string to search
   * @param string $replace string to replace bad bytes with (defaults to '?') - use ASCII
   * @return string
   * @package utf8
   */
  function utf8_bad_replace($str, $replace = '?') {
      $UTF8_BAD =
          '([\x00-\x7F]'.                        // ASCII (including control chars)
          '|[\xC2-\xDF][\x80-\xBF]'.             // non-overlong 2-byte
          '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // excluding overlongs
          '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // straight 3-byte
          '|\xED[\x80-\x9F][\x80-\xBF]'.         // excluding surrogates
          '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // planes 1-3
          '|[\xF1-\xF3][\x80-\xBF]{3}'.          // planes 4-15
          '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // plane 16
          '|(.{1}))';                            // invalid byte

      $len = strlen($str);
      $pos = 0;

      // Build the result by plain concatenation: no dependency on the global
      // output-buffer stack, so this also works where ob_start() is not
      // permitted (e.g. inside an output handler).
      $result = '';

      // Anchored (A modifier) matching at a moving offset avoids copying the
      // unprocessed tail with substr() on every iteration.
      while ($pos < $len && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $pos)) {
          // Group 2 is only present when the "invalid byte" alternative matched.
          if (isset($matches[2])) {
              $result .= $replace;
          } else {
              $result .= $matches[0];
          }
          $pos += strlen($matches[0]);
      }

      return $result;
  }
  160. //--------------------------------------------------------------------
  /**
   * utf8_bad_identify() status: the lead octet of a five octet sequence
   * was found. Such a sequence cannot encode a usable Unicode character
   * (it would be either non-shortest form or beyond U+10FFFF).
   * @see utf8_bad_identify
   * @package utf8
   */
  define('UTF8_BAD_5OCTET', 1);

  /**
   * utf8_bad_identify() status: the lead octet of a six octet sequence
   * was found. Rejected for the same reason as five octet sequences.
   * @see utf8_bad_identify
   * @package utf8
   */
  define('UTF8_BAD_6OCTET', 2);

  /**
   * utf8_bad_identify() status: an octet that is neither US-ASCII nor a
   * legal first octet of a multi-byte UTF-8 sequence was found where a new
   * sequence was expected.
   * @see utf8_bad_identify
   * @package utf8
   */
  define('UTF8_BAD_SEQID', 3);

  /**
   * utf8_bad_identify() status: a sequence was not in shortest form,
   * which is illegal from Unicode 3.1 onwards.
   * @see utf8_bad_identify
   * @package utf8
   */
  define('UTF8_BAD_NONSHORT', 4);

  /**
   * utf8_bad_identify() status: a sequence decoded to a surrogate
   * character, which is illegal from Unicode 3.2 onwards.
   * @see utf8_bad_identify
   * @package utf8
   */
  define('UTF8_BAD_SURROGATE', 5);

  /**
   * utf8_bad_identify() status: a sequence decoded to a codepoint outside
   * the Unicode range.
   * @see utf8_bad_identify
   * @package utf8
   */
  define('UTF8_BAD_UNIOUTRANGE', 6);

  /**
   * utf8_bad_identify() status: a multi-octet sequence was not completed.
   * Note: this acts as a kind of "catch-all".
   * @see utf8_bad_identify
   * @package utf8
   */
  define('UTF8_BAD_SEQINCOMPLETE', 7);
  213. //--------------------------------------------------------------------
  /**
   * Reports on the type of bad byte found in a UTF-8 string. Returns a
   * status code for the first problem found.
   *
   * The string is decoded octet by octet with a small state machine. On
   * return, $i tells the caller where the problem was detected:
   *  - UTF8_BAD_5OCTET, UTF8_BAD_6OCTET, UTF8_BAD_SEQID: index of the
   *    offending lead octet;
   *  - UTF8_BAD_NONSHORT, UTF8_BAD_SURROGATE, UTF8_BAD_UNIOUTRANGE: index of
   *    the final octet of the offending sequence;
   *  - UTF8_BAD_SEQINCOMPLETE: index of the octet just before the point at
   *    which a continuation octet was expected but not found;
   *  - no problem: NULL.
   *
   * @author <hsivonen@iki.fi>
   * @param string $str UTF-8 encoded string
   * @param int $i passed by reference; receives the byte index described
   *               above, or NULL if the string is valid
   * @return mixed integer constant describing problem or FALSE if valid UTF-8
   * @see utf8_bad_explain
   * @see http://hsivonen.iki.fi/php-utf8/
   * @package utf8
   */
  function utf8_bad_identify($str, &$i) {
      $mState = 0; // expected number of octets still to come after the current
                   // octet before the next UTF-8 character sequence begins
      $mUcs4  = 0; // Unicode codepoint accumulated so far
      $mBytes = 1; // expected total number of octets in the current sequence

      $len = strlen($str);

      for ($i = 0; $i < $len; $i++) {

          // Square-bracket offset: the curly-brace form ($str{$i}) is
          // deprecated in PHP 7.4 and a parse error from PHP 8.0.
          $in = ord($str[$i]);

          if ($mState === 0) {

              // When mState is zero we expect either a US-ASCII character or
              // the first octet of a multi-octet sequence.
              if (0 === (0x80 & $in)) {
                  // US-ASCII, pass straight through.
                  $mBytes = 1;

              } elseif (0xC0 === (0xE0 & $in)) {
                  // First octet of 2 octet sequence
                  $mUcs4  = ($in & 0x1F) << 6;
                  $mState = 1;
                  $mBytes = 2;

              } elseif (0xE0 === (0xF0 & $in)) {
                  // First octet of 3 octet sequence
                  $mUcs4  = ($in & 0x0F) << 12;
                  $mState = 2;
                  $mBytes = 3;

              } elseif (0xF0 === (0xF8 & $in)) {
                  // First octet of 4 octet sequence
                  $mUcs4  = ($in & 0x07) << 18;
                  $mState = 3;
                  $mBytes = 4;

              } elseif (0xF8 === (0xFC & $in)) {
                  /* First octet of 5 octet sequence.
                   *
                   * This is illegal because the encoded codepoint must be either
                   * (a) not the shortest form or
                   * (b) outside the Unicode range of 0-0x10FFFF.
                   */
                  return UTF8_BAD_5OCTET;

              } elseif (0xFC === (0xFE & $in)) {
                  // First octet of 6 octet sequence, see comments for 5 octet sequence.
                  return UTF8_BAD_6OCTET;

              } else {
                  // Current octet is neither in the US-ASCII range nor a legal
                  // first octet of a multi-octet sequence.
                  return UTF8_BAD_SEQID;
              }

          } else {

              // When mState is non-zero, we expect a continuation of the
              // multi-octet sequence.
              if (0x80 === (0xC0 & $in)) {

                  // Legal continuation: merge its six payload bits into place.
                  $shift  = ($mState - 1) * 6;
                  $mUcs4 |= ($in & 0x0000003F) << $shift;

                  // End of the multi-octet sequence: mUcs4 now contains the
                  // final Unicode codepoint, so validate it.
                  if (0 === --$mState) {

                      if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                          ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                          ((4 === $mBytes) && ($mUcs4 < 0x10000))) {
                          // From Unicode 3.1, non-shortest form is illegal
                          return UTF8_BAD_NONSHORT;

                      } elseif (($mUcs4 & 0xFFFFF800) == 0xD800) {
                          // From Unicode 3.2, surrogate characters are illegal
                          return UTF8_BAD_SURROGATE;

                      } elseif ($mUcs4 > 0x10FFFF) {
                          // Codepoints outside the Unicode range are illegal
                          return UTF8_BAD_UNIOUTRANGE;
                      }

                      // Reset the decoder for the next character.
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                  }

              } else {
                  // A continuation octet was expected but something else was
                  // found: incomplete multi-octet sequence. Step back so $i
                  // refers to the last octet of the incomplete sequence.
                  $i--;
                  return UTF8_BAD_SEQINCOMPLETE;
              }
          }
      }

      if ($mState !== 0) {
          // The string ended in the middle of a multi-octet sequence.
          // $i equals the string length here; step back to the last octet.
          $i--;
          return UTF8_BAD_SEQINCOMPLETE;
      }

      // No bad octets found
      $i = NULL;
      return FALSE;
  }
  320. //--------------------------------------------------------------------
  /**
   * Takes a return code from utf8_bad_identify() and returns a message
   * (in English) explaining what the problem is.
   *
   * An unrecognised code raises an E_USER_WARNING via trigger_error() and
   * yields FALSE.
   *
   * @param int $code return code from utf8_bad_identify
   * @return mixed string message or FALSE if return code unknown
   * @see utf8_bad_identify
   * @package utf8
   */
  function utf8_bad_explain($code) {

      // Pick the message for the code; unknown codes bail out in the default
      // branch so that only known codes reach the final return.
      switch ($code) {

          case UTF8_BAD_5OCTET:
              $explanation = 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
              break;

          case UTF8_BAD_6OCTET:
              $explanation = 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
              break;

          case UTF8_BAD_SEQID:
              $explanation = 'Invalid octet for use as start of multi-byte UTF-8 sequence';
              break;

          case UTF8_BAD_NONSHORT:
              $explanation = 'From Unicode 3.1, non-shortest form is illegal';
              break;

          case UTF8_BAD_SURROGATE:
              $explanation = 'From Unicode 3.2, surrogate characters are illegal';
              break;

          case UTF8_BAD_UNIOUTRANGE:
              $explanation = 'Codepoints outside the Unicode range are illegal';
              break;

          case UTF8_BAD_SEQINCOMPLETE:
              $explanation = 'Incomplete multi-octet sequence';
              break;

          default:
              trigger_error('Unknown error code: '.$code,E_USER_WARNING);
              return FALSE;
      }

      return $explanation;
  }