PageRenderTime 45ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/libraries/phputf8/utils/bad.php

https://bitbucket.org/talueses/joomla-cms
PHP | 420 lines | 195 code | 51 blank | 174 comment | 52 complexity | 81e8088b5ed8ac4b6a74e4f3203af34e MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, JSON
  1. <?php
  2. /**
  3. * Tools for locating / replacing bad bytes in UTF-8 strings
  4. * The Original Code is Mozilla Communicator client code.
  5. * The Initial Developer of the Original Code is
  6. * Netscape Communications Corporation.
  7. * Portions created by the Initial Developer are Copyright (C) 1998
  8. * the Initial Developer. All Rights Reserved.
  9. * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  10. * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  11. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  12. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  13. * @see http://hsivonen.iki.fi/php-utf8/
  14. * @package utf8
  15. * @subpackage bad
  16. * @see utf8_is_valid
  17. */
  18. //--------------------------------------------------------------------
  19. /**
  20. * Locates the first bad byte in a UTF-8 string returning it's
  21. * byte index in the string
  22. * PCRE Pattern to locate bad bytes in a UTF-8 string
  23. * Comes from W3 FAQ: Multilingual Forms
  24. * Note: modified to include full ASCII range including control chars
  25. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  26. * @param string
  27. * @return mixed integer byte index or FALSE if no bad found
  28. * @package utf8
  29. * @subpackage bad
  30. */
  31. function utf8_bad_find($str) {
  32. $UTF8_BAD =
  33. '([\x00-\x7F]'. # ASCII (including control chars)
  34. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  35. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  36. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  37. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  38. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  39. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  40. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  41. '|(.{1}))'; # invalid byte
  42. $pos = 0;
  43. $badList = array();
  44. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  45. $bytes = strlen($matches[0]);
  46. if ( isset($matches[2])) {
  47. return $pos;
  48. }
  49. $pos += $bytes;
  50. $str = substr($str,$bytes);
  51. }
  52. return FALSE;
  53. }
  54. //--------------------------------------------------------------------
  55. /**
  56. * Locates all bad bytes in a UTF-8 string and returns a list of their
  57. * byte index in the string
  58. * PCRE Pattern to locate bad bytes in a UTF-8 string
  59. * Comes from W3 FAQ: Multilingual Forms
  60. * Note: modified to include full ASCII range including control chars
  61. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  62. * @param string
  63. * @return mixed array of integers or FALSE if no bad found
  64. * @package utf8
  65. * @subpackage bad
  66. */
  67. function utf8_bad_findall($str) {
  68. $UTF8_BAD =
  69. '([\x00-\x7F]'. # ASCII (including control chars)
  70. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  71. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  72. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  73. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  74. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  75. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  76. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  77. '|(.{1}))'; # invalid byte
  78. $pos = 0;
  79. $badList = array();
  80. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  81. $bytes = strlen($matches[0]);
  82. if ( isset($matches[2])) {
  83. $badList[] = $pos;
  84. }
  85. $pos += $bytes;
  86. $str = substr($str,$bytes);
  87. }
  88. if ( count($badList) > 0 ) {
  89. return $badList;
  90. }
  91. return FALSE;
  92. }
  93. //--------------------------------------------------------------------
  94. /**
  95. * Strips out any bad bytes from a UTF-8 string and returns the rest
  96. * PCRE Pattern to locate bad bytes in a UTF-8 string
  97. * Comes from W3 FAQ: Multilingual Forms
  98. * Note: modified to include full ASCII range including control chars
  99. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  100. * @param string
  101. * @return string
  102. * @package utf8
  103. * @subpackage bad
  104. */
  105. function utf8_bad_strip($str) {
  106. $UTF8_BAD =
  107. '([\x00-\x7F]'. # ASCII (including control chars)
  108. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  109. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  110. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  111. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  112. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  113. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  114. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  115. '|(.{1}))'; # invalid byte
  116. ob_start();
  117. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  118. if ( !isset($matches[2])) {
  119. echo $matches[0];
  120. }
  121. $str = substr($str,strlen($matches[0]));
  122. }
  123. $result = ob_get_contents();
  124. ob_end_clean();
  125. return $result;
  126. }
  127. //--------------------------------------------------------------------
  128. /**
  129. * Replace bad bytes with an alternative character - ASCII character
  130. * recommended is replacement char
  131. * PCRE Pattern to locate bad bytes in a UTF-8 string
  132. * Comes from W3 FAQ: Multilingual Forms
  133. * Note: modified to include full ASCII range including control chars
  134. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  135. * @param string to search
  136. * @param string to replace bad bytes with (defaults to '?') - use ASCII
  137. * @return string
  138. * @package utf8
  139. * @subpackage bad
  140. */
  141. function utf8_bad_replace($str, $replace = '?') {
  142. $UTF8_BAD =
  143. '([\x00-\x7F]'. # ASCII (including control chars)
  144. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  145. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  146. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  147. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  148. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  149. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  150. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  151. '|(.{1}))'; # invalid byte
  152. ob_start();
  153. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  154. if ( !isset($matches[2])) {
  155. echo $matches[0];
  156. } else {
  157. echo $replace;
  158. }
  159. $str = substr($str,strlen($matches[0]));
  160. }
  161. $result = ob_get_contents();
  162. ob_end_clean();
  163. return $result;
  164. }
  165. //--------------------------------------------------------------------
  166. /**
  167. * Return code from utf8_bad_identify() when a five octet sequence is detected.
  168. * Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
  169. * do not represent a useful character
  170. * @see utf8_bad_identify
  171. * @package utf8
  172. * @subpackage bad
  173. */
  174. define('UTF8_BAD_5OCTET',1);
  175. /**
  176. * Return code from utf8_bad_identify() when a six octet sequence is detected.
  177. * Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
  178. * do not represent a useful character
  179. * @see utf8_bad_identify
  180. * @package utf8
  181. * @subpackage bad
  182. */
  183. define('UTF8_BAD_6OCTET',2);
  184. /**
  185. * Return code from utf8_bad_identify().
  186. * Invalid octet for use as start of multi-byte UTF-8 sequence
  187. * @see utf8_bad_identify
  188. * @package utf8
  189. * @subpackage bad
  190. */
  191. define('UTF8_BAD_SEQID',3);
  192. /**
  193. * Return code from utf8_bad_identify().
  194. * From Unicode 3.1, non-shortest form is illegal
  195. * @see utf8_bad_identify
  196. * @package utf8
  197. * @subpackage bad
  198. */
  199. define('UTF8_BAD_NONSHORT',4);
  200. /**
  201. * Return code from utf8_bad_identify().
  202. * From Unicode 3.2, surrogate characters are illegal
  203. * @see utf8_bad_identify
  204. * @package utf8
  205. * @subpackage bad
  206. */
  207. define('UTF8_BAD_SURROGATE',5);
  208. /**
  209. * Return code from utf8_bad_identify().
  210. * Codepoints outside the Unicode range are illegal
  211. * @see utf8_bad_identify
  212. * @package utf8
  213. * @subpackage bad
  214. */
  215. define('UTF8_BAD_UNIOUTRANGE',6);
  216. /**
  217. * Return code from utf8_bad_identify().
  218. * Incomplete multi-octet sequence
  219. * Note: this is kind of a "catch-all"
  220. * @see utf8_bad_identify
  221. * @package utf8
  222. * @subpackage bad
  223. */
  224. define('UTF8_BAD_SEQINCOMPLETE',7);
  225. //--------------------------------------------------------------------
  226. /**
  227. * Reports on the type of bad byte found in a UTF-8 string. Returns a
  228. * status code on the first bad byte found
  229. * @author <hsivonen@iki.fi>
  230. * @param string UTF-8 encoded string
  231. * @return mixed integer constant describing problem or FALSE if valid UTF-8
  232. * @see utf8_bad_explain
  233. * @see http://hsivonen.iki.fi/php-utf8/
  234. * @package utf8
  235. * @subpackage bad
  236. */
  237. function utf8_bad_identify($str, &$i) {
  238. $mState = 0; // cached expected number of octets after the current octet
  239. // until the beginning of the next UTF8 character sequence
  240. $mUcs4 = 0; // cached Unicode character
  241. $mBytes = 1; // cached expected number of octets in the current sequence
  242. $len = strlen($str);
  243. for($i = 0; $i < $len; $i++) {
  244. $in = ord($str{$i});
  245. if ( $mState == 0) {
  246. // When mState is zero we expect either a US-ASCII character or a
  247. // multi-octet sequence.
  248. if (0 == (0x80 & ($in))) {
  249. // US-ASCII, pass straight through.
  250. $mBytes = 1;
  251. } else if (0xC0 == (0xE0 & ($in))) {
  252. // First octet of 2 octet sequence
  253. $mUcs4 = ($in);
  254. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  255. $mState = 1;
  256. $mBytes = 2;
  257. } else if (0xE0 == (0xF0 & ($in))) {
  258. // First octet of 3 octet sequence
  259. $mUcs4 = ($in);
  260. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  261. $mState = 2;
  262. $mBytes = 3;
  263. } else if (0xF0 == (0xF8 & ($in))) {
  264. // First octet of 4 octet sequence
  265. $mUcs4 = ($in);
  266. $mUcs4 = ($mUcs4 & 0x07) << 18;
  267. $mState = 3;
  268. $mBytes = 4;
  269. } else if (0xF8 == (0xFC & ($in))) {
  270. /* First octet of 5 octet sequence.
  271. *
  272. * This is illegal because the encoded codepoint must be either
  273. * (a) not the shortest form or
  274. * (b) outside the Unicode range of 0-0x10FFFF.
  275. */
  276. return UTF8_BAD_5OCTET;
  277. } else if (0xFC == (0xFE & ($in))) {
  278. // First octet of 6 octet sequence, see comments for 5 octet sequence.
  279. return UTF8_BAD_6OCTET;
  280. } else {
  281. // Current octet is neither in the US-ASCII range nor a legal first
  282. // octet of a multi-octet sequence.
  283. return UTF8_BAD_SEQID;
  284. }
  285. } else {
  286. // When mState is non-zero, we expect a continuation of the multi-octet
  287. // sequence
  288. if (0x80 == (0xC0 & ($in))) {
  289. // Legal continuation.
  290. $shift = ($mState - 1) * 6;
  291. $tmp = $in;
  292. $tmp = ($tmp & 0x0000003F) << $shift;
  293. $mUcs4 |= $tmp;
  294. /**
  295. * End of the multi-octet sequence. mUcs4 now contains the final
  296. * Unicode codepoint to be output
  297. */
  298. if (0 == --$mState) {
  299. // From Unicode 3.1, non-shortest form is illegal
  300. if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
  301. ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
  302. ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
  303. return UTF8_BAD_NONSHORT;
  304. // From Unicode 3.2, surrogate characters are illegal
  305. } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
  306. return UTF8_BAD_SURROGATE;
  307. // Codepoints outside the Unicode range are illegal
  308. } else if ($mUcs4 > 0x10FFFF) {
  309. return UTF8_BAD_UNIOUTRANGE;
  310. }
  311. //initialize UTF8 cache
  312. $mState = 0;
  313. $mUcs4 = 0;
  314. $mBytes = 1;
  315. }
  316. } else {
  317. // ((0xC0 & (*in) != 0x80) && (mState != 0))
  318. // Incomplete multi-octet sequence.
  319. $i--;
  320. return UTF8_BAD_SEQINCOMPLETE;
  321. }
  322. }
  323. }
  324. if ( $mState != 0 ) {
  325. // Incomplete multi-octet sequence.
  326. $i--;
  327. return UTF8_BAD_SEQINCOMPLETE;
  328. }
  329. // No bad octets found
  330. $i = NULL;
  331. return FALSE;
  332. }
  333. //--------------------------------------------------------------------
  334. /**
  335. * Takes a return code from utf8_bad_identify() are returns a message
  336. * (in English) explaining what the problem is.
  337. * @param int return code from utf8_bad_identify
  338. * @return mixed string message or FALSE if return code unknown
  339. * @see utf8_bad_identify
  340. * @package utf8
  341. * @subpackage bad
  342. */
  343. function utf8_bad_explain($code) {
  344. switch ($code) {
  345. case UTF8_BAD_5OCTET:
  346. return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
  347. break;
  348. case UTF8_BAD_6OCTET:
  349. return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
  350. break;
  351. case UTF8_BAD_SEQID:
  352. return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
  353. break;
  354. case UTF8_BAD_NONSHORT:
  355. return 'From Unicode 3.1, non-shortest form is illegal';
  356. break;
  357. case UTF8_BAD_SURROGATE:
  358. return 'From Unicode 3.2, surrogate characters are illegal';
  359. break;
  360. case UTF8_BAD_UNIOUTRANGE:
  361. return 'Codepoints outside the Unicode range are illegal';
  362. break;
  363. case UTF8_BAD_SEQINCOMPLETE:
  364. return 'Incomplete multi-octet sequence';
  365. break;
  366. }
  367. trigger_error('Unknown error code: '.$code,E_USER_WARNING);
  368. return FALSE;
  369. }