PageRenderTime 44ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/libraries/phputf8/utils/bad.php

https://bitbucket.org/biojazzard/joomla-eboracast
PHP | 421 lines | 195 code | 51 blank | 175 comment | 52 complexity | 64bf72e82f9bf01e02e69d9293f24325 MD5 | raw file
Possible License(s): LGPL-2.1, GPL-2.0, MIT, BSD-3-Clause
  1. <?php
  2. /**
  3. * @version $Id$
  4. * Tools for locating / replacing bad bytes in UTF-8 strings
  5. * The Original Code is Mozilla Communicator client code.
  6. * The Initial Developer of the Original Code is
  7. * Netscape Communications Corporation.
  8. * Portions created by the Initial Developer are Copyright (C) 1998
  9. * the Initial Developer. All Rights Reserved.
  10. * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  11. * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  12. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  13. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  14. * @see http://hsivonen.iki.fi/php-utf8/
  15. * @package utf8
  16. * @subpackage bad
  17. * @see utf8_is_valid
  18. */
  19. //--------------------------------------------------------------------
  20. /**
  21. * Locates the first bad byte in a UTF-8 string returning it's
  22. * byte index in the string
  23. * PCRE Pattern to locate bad bytes in a UTF-8 string
  24. * Comes from W3 FAQ: Multilingual Forms
  25. * Note: modified to include full ASCII range including control chars
  26. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  27. * @param string
  28. * @return mixed integer byte index or FALSE if no bad found
  29. * @package utf8
  30. * @subpackage bad
  31. */
  32. function utf8_bad_find($str) {
  33. $UTF8_BAD =
  34. '([\x00-\x7F]'. # ASCII (including control chars)
  35. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  36. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  37. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  38. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  39. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  40. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  41. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  42. '|(.{1}))'; # invalid byte
  43. $pos = 0;
  44. $badList = array();
  45. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  46. $bytes = strlen($matches[0]);
  47. if ( isset($matches[2])) {
  48. return $pos;
  49. }
  50. $pos += $bytes;
  51. $str = substr($str,$bytes);
  52. }
  53. return FALSE;
  54. }
  55. //--------------------------------------------------------------------
  56. /**
  57. * Locates all bad bytes in a UTF-8 string and returns a list of their
  58. * byte index in the string
  59. * PCRE Pattern to locate bad bytes in a UTF-8 string
  60. * Comes from W3 FAQ: Multilingual Forms
  61. * Note: modified to include full ASCII range including control chars
  62. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  63. * @param string
  64. * @return mixed array of integers or FALSE if no bad found
  65. * @package utf8
  66. * @subpackage bad
  67. */
  68. function utf8_bad_findall($str) {
  69. $UTF8_BAD =
  70. '([\x00-\x7F]'. # ASCII (including control chars)
  71. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  72. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  73. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  74. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  75. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  76. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  77. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  78. '|(.{1}))'; # invalid byte
  79. $pos = 0;
  80. $badList = array();
  81. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  82. $bytes = strlen($matches[0]);
  83. if ( isset($matches[2])) {
  84. $badList[] = $pos;
  85. }
  86. $pos += $bytes;
  87. $str = substr($str,$bytes);
  88. }
  89. if ( count($badList) > 0 ) {
  90. return $badList;
  91. }
  92. return FALSE;
  93. }
  94. //--------------------------------------------------------------------
  95. /**
  96. * Strips out any bad bytes from a UTF-8 string and returns the rest
  97. * PCRE Pattern to locate bad bytes in a UTF-8 string
  98. * Comes from W3 FAQ: Multilingual Forms
  99. * Note: modified to include full ASCII range including control chars
  100. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  101. * @param string
  102. * @return string
  103. * @package utf8
  104. * @subpackage bad
  105. */
  106. function utf8_bad_strip($str) {
  107. $UTF8_BAD =
  108. '([\x00-\x7F]'. # ASCII (including control chars)
  109. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  110. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  111. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  112. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  113. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  114. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  115. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  116. '|(.{1}))'; # invalid byte
  117. ob_start();
  118. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  119. if ( !isset($matches[2])) {
  120. echo $matches[0];
  121. }
  122. $str = substr($str,strlen($matches[0]));
  123. }
  124. $result = ob_get_contents();
  125. ob_end_clean();
  126. return $result;
  127. }
  128. //--------------------------------------------------------------------
  129. /**
  130. * Replace bad bytes with an alternative character - ASCII character
  131. * recommended is replacement char
  132. * PCRE Pattern to locate bad bytes in a UTF-8 string
  133. * Comes from W3 FAQ: Multilingual Forms
  134. * Note: modified to include full ASCII range including control chars
  135. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  136. * @param string to search
  137. * @param string to replace bad bytes with (defaults to '?') - use ASCII
  138. * @return string
  139. * @package utf8
  140. * @subpackage bad
  141. */
  142. function utf8_bad_replace($str, $replace = '?') {
  143. $UTF8_BAD =
  144. '([\x00-\x7F]'. # ASCII (including control chars)
  145. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  146. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  147. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  148. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  149. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  150. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  151. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  152. '|(.{1}))'; # invalid byte
  153. ob_start();
  154. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  155. if ( !isset($matches[2])) {
  156. echo $matches[0];
  157. } else {
  158. echo $replace;
  159. }
  160. $str = substr($str,strlen($matches[0]));
  161. }
  162. $result = ob_get_contents();
  163. ob_end_clean();
  164. return $result;
  165. }
  166. //--------------------------------------------------------------------
  167. /**
  168. * Return code from utf8_bad_identify() when a five octet sequence is detected.
  169. * Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
  170. * do not represent a useful character
  171. * @see utf8_bad_identify
  172. * @package utf8
  173. * @subpackage bad
  174. */
  175. define('UTF8_BAD_5OCTET',1);
  176. /**
  177. * Return code from utf8_bad_identify() when a six octet sequence is detected.
  178. * Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
  179. * do not represent a useful character
  180. * @see utf8_bad_identify
  181. * @package utf8
  182. * @subpackage bad
  183. */
  184. define('UTF8_BAD_6OCTET',2);
  185. /**
  186. * Return code from utf8_bad_identify().
  187. * Invalid octet for use as start of multi-byte UTF-8 sequence
  188. * @see utf8_bad_identify
  189. * @package utf8
  190. * @subpackage bad
  191. */
  192. define('UTF8_BAD_SEQID',3);
  193. /**
  194. * Return code from utf8_bad_identify().
  195. * From Unicode 3.1, non-shortest form is illegal
  196. * @see utf8_bad_identify
  197. * @package utf8
  198. * @subpackage bad
  199. */
  200. define('UTF8_BAD_NONSHORT',4);
  201. /**
  202. * Return code from utf8_bad_identify().
  203. * From Unicode 3.2, surrogate characters are illegal
  204. * @see utf8_bad_identify
  205. * @package utf8
  206. * @subpackage bad
  207. */
  208. define('UTF8_BAD_SURROGATE',5);
  209. /**
  210. * Return code from utf8_bad_identify().
  211. * Codepoints outside the Unicode range are illegal
  212. * @see utf8_bad_identify
  213. * @package utf8
  214. * @subpackage bad
  215. */
  216. define('UTF8_BAD_UNIOUTRANGE',6);
  217. /**
  218. * Return code from utf8_bad_identify().
  219. * Incomplete multi-octet sequence
  220. * Note: this is kind of a "catch-all"
  221. * @see utf8_bad_identify
  222. * @package utf8
  223. * @subpackage bad
  224. */
  225. define('UTF8_BAD_SEQINCOMPLETE',7);
  226. //--------------------------------------------------------------------
  227. /**
  228. * Reports on the type of bad byte found in a UTF-8 string. Returns a
  229. * status code on the first bad byte found
  230. * @author <hsivonen@iki.fi>
  231. * @param string UTF-8 encoded string
  232. * @return mixed integer constant describing problem or FALSE if valid UTF-8
  233. * @see utf8_bad_explain
  234. * @see http://hsivonen.iki.fi/php-utf8/
  235. * @package utf8
  236. * @subpackage bad
  237. */
  238. function utf8_bad_identify($str, &$i) {
  239. $mState = 0; // cached expected number of octets after the current octet
  240. // until the beginning of the next UTF8 character sequence
  241. $mUcs4 = 0; // cached Unicode character
  242. $mBytes = 1; // cached expected number of octets in the current sequence
  243. $len = strlen($str);
  244. for($i = 0; $i < $len; $i++) {
  245. $in = ord($str{$i});
  246. if ( $mState == 0) {
  247. // When mState is zero we expect either a US-ASCII character or a
  248. // multi-octet sequence.
  249. if (0 == (0x80 & ($in))) {
  250. // US-ASCII, pass straight through.
  251. $mBytes = 1;
  252. } else if (0xC0 == (0xE0 & ($in))) {
  253. // First octet of 2 octet sequence
  254. $mUcs4 = ($in);
  255. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  256. $mState = 1;
  257. $mBytes = 2;
  258. } else if (0xE0 == (0xF0 & ($in))) {
  259. // First octet of 3 octet sequence
  260. $mUcs4 = ($in);
  261. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  262. $mState = 2;
  263. $mBytes = 3;
  264. } else if (0xF0 == (0xF8 & ($in))) {
  265. // First octet of 4 octet sequence
  266. $mUcs4 = ($in);
  267. $mUcs4 = ($mUcs4 & 0x07) << 18;
  268. $mState = 3;
  269. $mBytes = 4;
  270. } else if (0xF8 == (0xFC & ($in))) {
  271. /* First octet of 5 octet sequence.
  272. *
  273. * This is illegal because the encoded codepoint must be either
  274. * (a) not the shortest form or
  275. * (b) outside the Unicode range of 0-0x10FFFF.
  276. */
  277. return UTF8_BAD_5OCTET;
  278. } else if (0xFC == (0xFE & ($in))) {
  279. // First octet of 6 octet sequence, see comments for 5 octet sequence.
  280. return UTF8_BAD_6OCTET;
  281. } else {
  282. // Current octet is neither in the US-ASCII range nor a legal first
  283. // octet of a multi-octet sequence.
  284. return UTF8_BAD_SEQID;
  285. }
  286. } else {
  287. // When mState is non-zero, we expect a continuation of the multi-octet
  288. // sequence
  289. if (0x80 == (0xC0 & ($in))) {
  290. // Legal continuation.
  291. $shift = ($mState - 1) * 6;
  292. $tmp = $in;
  293. $tmp = ($tmp & 0x0000003F) << $shift;
  294. $mUcs4 |= $tmp;
  295. /**
  296. * End of the multi-octet sequence. mUcs4 now contains the final
  297. * Unicode codepoint to be output
  298. */
  299. if (0 == --$mState) {
  300. // From Unicode 3.1, non-shortest form is illegal
  301. if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
  302. ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
  303. ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
  304. return UTF8_BAD_NONSHORT;
  305. // From Unicode 3.2, surrogate characters are illegal
  306. } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
  307. return UTF8_BAD_SURROGATE;
  308. // Codepoints outside the Unicode range are illegal
  309. } else if ($mUcs4 > 0x10FFFF) {
  310. return UTF8_BAD_UNIOUTRANGE;
  311. }
  312. //initialize UTF8 cache
  313. $mState = 0;
  314. $mUcs4 = 0;
  315. $mBytes = 1;
  316. }
  317. } else {
  318. // ((0xC0 & (*in) != 0x80) && (mState != 0))
  319. // Incomplete multi-octet sequence.
  320. $i--;
  321. return UTF8_BAD_SEQINCOMPLETE;
  322. }
  323. }
  324. }
  325. if ( $mState != 0 ) {
  326. // Incomplete multi-octet sequence.
  327. $i--;
  328. return UTF8_BAD_SEQINCOMPLETE;
  329. }
  330. // No bad octets found
  331. $i = NULL;
  332. return FALSE;
  333. }
  334. //--------------------------------------------------------------------
  335. /**
  336. * Takes a return code from utf8_bad_identify() are returns a message
  337. * (in English) explaining what the problem is.
  338. * @param int return code from utf8_bad_identify
  339. * @return mixed string message or FALSE if return code unknown
  340. * @see utf8_bad_identify
  341. * @package utf8
  342. * @subpackage bad
  343. */
  344. function utf8_bad_explain($code) {
  345. switch ($code) {
  346. case UTF8_BAD_5OCTET:
  347. return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
  348. break;
  349. case UTF8_BAD_6OCTET:
  350. return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
  351. break;
  352. case UTF8_BAD_SEQID:
  353. return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
  354. break;
  355. case UTF8_BAD_NONSHORT:
  356. return 'From Unicode 3.1, non-shortest form is illegal';
  357. break;
  358. case UTF8_BAD_SURROGATE:
  359. return 'From Unicode 3.2, surrogate characters are illegal';
  360. break;
  361. case UTF8_BAD_UNIOUTRANGE:
  362. return 'Codepoints outside the Unicode range are illegal';
  363. break;
  364. case UTF8_BAD_SEQINCOMPLETE:
  365. return 'Incomplete multi-octet sequence';
  366. break;
  367. }
  368. trigger_error('Unknown error code: '.$code,E_USER_WARNING);
  369. return FALSE;
  370. }