PageRenderTime 25ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/php/plugins/utf8/utils/bad.php

https://bitbucket.org/chiamingyen/cmsimple-and-plugins
PHP | 395 lines | 195 code | 51 blank | 149 comment | 52 complexity | 1f0196826a0964fa1c24cd3d5b86ae0b MD5 | raw file
  1. <?php
  2. /**
  3. * @version $Id: bad.php 4 2012-08-07 21:08:48Z cmb69 $
  4. * Tools for locating / replacing bad bytes in UTF-8 strings
  5. * The Original Code is Mozilla Communicator client code.
  6. * The Initial Developer of the Original Code is
  7. * Netscape Communications Corporation.
  8. * Portions created by the Initial Developer are Copyright (C) 1998
  9. * the Initial Developer. All Rights Reserved.
  10. * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  11. * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  12. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  13. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  14. * @see http://hsivonen.iki.fi/php-utf8/
  15. * @package utf8
  16. * @subpackage bad
  17. * @see utf8_is_valid
  18. */
  19. //--------------------------------------------------------------------
  20. /**
  21. * Locates the first bad byte in a UTF-8 string returning it's
  22. * byte index in the string
  23. * PCRE Pattern to locate bad bytes in a UTF-8 string
  24. * Comes from W3 FAQ: Multilingual Forms
  25. * Note: modified to include full ASCII range including control chars
  26. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  27. * @param string
  28. * @return mixed integer byte index or FALSE if no bad found
  29. */
  30. function utf8_bad_find($str) {
  31. $UTF8_BAD =
  32. '([\x00-\x7F]'. # ASCII (including control chars)
  33. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  34. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  35. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  36. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  37. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  38. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  39. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  40. '|(.{1}))'; # invalid byte
  41. $pos = 0;
  42. $badList = array();
  43. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  44. $bytes = strlen($matches[0]);
  45. if ( isset($matches[2])) {
  46. return $pos;
  47. }
  48. $pos += $bytes;
  49. $str = substr($str,$bytes);
  50. }
  51. return FALSE;
  52. }
  53. //--------------------------------------------------------------------
  54. /**
  55. * Locates all bad bytes in a UTF-8 string and returns a list of their
  56. * byte index in the string
  57. * PCRE Pattern to locate bad bytes in a UTF-8 string
  58. * Comes from W3 FAQ: Multilingual Forms
  59. * Note: modified to include full ASCII range including control chars
  60. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  61. * @param string
  62. * @return mixed array of integers or FALSE if no bad found
  63. */
  64. function utf8_bad_findall($str) {
  65. $UTF8_BAD =
  66. '([\x00-\x7F]'. # ASCII (including control chars)
  67. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  68. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  69. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  70. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  71. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  72. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  73. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  74. '|(.{1}))'; # invalid byte
  75. $pos = 0;
  76. $badList = array();
  77. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  78. $bytes = strlen($matches[0]);
  79. if ( isset($matches[2])) {
  80. $badList[] = $pos;
  81. }
  82. $pos += $bytes;
  83. $str = substr($str,$bytes);
  84. }
  85. if ( count($badList) > 0 ) {
  86. return $badList;
  87. }
  88. return FALSE;
  89. }
  90. //--------------------------------------------------------------------
  91. /**
  92. * Strips out any bad bytes from a UTF-8 string and returns the rest
  93. * PCRE Pattern to locate bad bytes in a UTF-8 string
  94. * Comes from W3 FAQ: Multilingual Forms
  95. * Note: modified to include full ASCII range including control chars
  96. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  97. * @param string
  98. * @return string
  99. */
  100. function utf8_bad_strip($str) {
  101. $UTF8_BAD =
  102. '([\x00-\x7F]'. # ASCII (including control chars)
  103. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  104. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  105. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  106. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  107. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  108. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  109. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  110. '|(.{1}))'; # invalid byte
  111. ob_start();
  112. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  113. if ( !isset($matches[2])) {
  114. echo $matches[0];
  115. }
  116. $str = substr($str,strlen($matches[0]));
  117. }
  118. $result = ob_get_contents();
  119. ob_end_clean();
  120. return $result;
  121. }
  122. //--------------------------------------------------------------------
  123. /**
  124. * Replace bad bytes with an alternative character - ASCII character
  125. * recommended is replacement char
  126. * PCRE Pattern to locate bad bytes in a UTF-8 string
  127. * Comes from W3 FAQ: Multilingual Forms
  128. * Note: modified to include full ASCII range including control chars
  129. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  130. * @param string to search
  131. * @param string to replace bad bytes with (defaults to '?') - use ASCII
  132. * @return string
  133. */
  134. function utf8_bad_replace($str, $replace = '?') {
  135. $UTF8_BAD =
  136. '([\x00-\x7F]'. # ASCII (including control chars)
  137. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  138. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  139. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  140. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  141. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  142. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  143. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  144. '|(.{1}))'; # invalid byte
  145. ob_start();
  146. while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  147. if ( !isset($matches[2])) {
  148. echo $matches[0];
  149. } else {
  150. echo $replace;
  151. }
  152. $str = substr($str,strlen($matches[0]));
  153. }
  154. $result = ob_get_contents();
  155. ob_end_clean();
  156. return $result;
  157. }
  158. //--------------------------------------------------------------------
  159. /**
  160. * Return code from utf8_bad_identify() when a five octet sequence is detected.
  161. * Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
  162. * do not represent a useful character
  163. * @see utf8_bad_identify
  164. */
  165. define('UTF8_BAD_5OCTET',1);
  166. /**
  167. * Return code from utf8_bad_identify() when a six octet sequence is detected.
  168. * Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
  169. * do not represent a useful character
  170. * @see utf8_bad_identify
  171. */
  172. define('UTF8_BAD_6OCTET',2);
  173. /**
  174. * Return code from utf8_bad_identify().
  175. * Invalid octet for use as start of multi-byte UTF-8 sequence
  176. * @see utf8_bad_identify
  177. */
  178. define('UTF8_BAD_SEQID',3);
  179. /**
  180. * Return code from utf8_bad_identify().
  181. * From Unicode 3.1, non-shortest form is illegal
  182. * @see utf8_bad_identify
  183. */
  184. define('UTF8_BAD_NONSHORT',4);
  185. /**
  186. * Return code from utf8_bad_identify().
  187. * From Unicode 3.2, surrogate characters are illegal
  188. * @see utf8_bad_identify
  189. */
  190. define('UTF8_BAD_SURROGATE',5);
  191. /**
  192. * Return code from utf8_bad_identify().
  193. * Codepoints outside the Unicode range are illegal
  194. * @see utf8_bad_identify
  195. */
  196. define('UTF8_BAD_UNIOUTRANGE',6);
  197. /**
  198. * Return code from utf8_bad_identify().
  199. * Incomplete multi-octet sequence
  200. * Note: this is kind of a "catch-all"
  201. * @see utf8_bad_identify
  202. */
  203. define('UTF8_BAD_SEQINCOMPLETE',7);
  204. //--------------------------------------------------------------------
  205. /**
  206. * Reports on the type of bad byte found in a UTF-8 string. Returns a
  207. * status code on the first bad byte found
  208. * @author <hsivonen@iki.fi>
  209. * @param string UTF-8 encoded string
  210. * @return mixed integer constant describing problem or FALSE if valid UTF-8
  211. * @see utf8_bad_explain
  212. * @see http://hsivonen.iki.fi/php-utf8/
  213. */
  214. function utf8_bad_identify($str, &$i) {
  215. $mState = 0; // cached expected number of octets after the current octet
  216. // until the beginning of the next UTF8 character sequence
  217. $mUcs4 = 0; // cached Unicode character
  218. $mBytes = 1; // cached expected number of octets in the current sequence
  219. $len = strlen($str);
  220. for($i = 0; $i < $len; $i++) {
  221. $in = ord($str{$i});
  222. if ( $mState == 0) {
  223. // When mState is zero we expect either a US-ASCII character or a
  224. // multi-octet sequence.
  225. if (0 == (0x80 & ($in))) {
  226. // US-ASCII, pass straight through.
  227. $mBytes = 1;
  228. } else if (0xC0 == (0xE0 & ($in))) {
  229. // First octet of 2 octet sequence
  230. $mUcs4 = ($in);
  231. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  232. $mState = 1;
  233. $mBytes = 2;
  234. } else if (0xE0 == (0xF0 & ($in))) {
  235. // First octet of 3 octet sequence
  236. $mUcs4 = ($in);
  237. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  238. $mState = 2;
  239. $mBytes = 3;
  240. } else if (0xF0 == (0xF8 & ($in))) {
  241. // First octet of 4 octet sequence
  242. $mUcs4 = ($in);
  243. $mUcs4 = ($mUcs4 & 0x07) << 18;
  244. $mState = 3;
  245. $mBytes = 4;
  246. } else if (0xF8 == (0xFC & ($in))) {
  247. /* First octet of 5 octet sequence.
  248. *
  249. * This is illegal because the encoded codepoint must be either
  250. * (a) not the shortest form or
  251. * (b) outside the Unicode range of 0-0x10FFFF.
  252. */
  253. return UTF8_BAD_5OCTET;
  254. } else if (0xFC == (0xFE & ($in))) {
  255. // First octet of 6 octet sequence, see comments for 5 octet sequence.
  256. return UTF8_BAD_6OCTET;
  257. } else {
  258. // Current octet is neither in the US-ASCII range nor a legal first
  259. // octet of a multi-octet sequence.
  260. return UTF8_BAD_SEQID;
  261. }
  262. } else {
  263. // When mState is non-zero, we expect a continuation of the multi-octet
  264. // sequence
  265. if (0x80 == (0xC0 & ($in))) {
  266. // Legal continuation.
  267. $shift = ($mState - 1) * 6;
  268. $tmp = $in;
  269. $tmp = ($tmp & 0x0000003F) << $shift;
  270. $mUcs4 |= $tmp;
  271. /**
  272. * End of the multi-octet sequence. mUcs4 now contains the final
  273. * Unicode codepoint to be output
  274. */
  275. if (0 == --$mState) {
  276. // From Unicode 3.1, non-shortest form is illegal
  277. if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
  278. ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
  279. ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
  280. return UTF8_BAD_NONSHORT;
  281. // From Unicode 3.2, surrogate characters are illegal
  282. } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
  283. return UTF8_BAD_SURROGATE;
  284. // Codepoints outside the Unicode range are illegal
  285. } else if ($mUcs4 > 0x10FFFF) {
  286. return UTF8_BAD_UNIOUTRANGE;
  287. }
  288. //initialize UTF8 cache
  289. $mState = 0;
  290. $mUcs4 = 0;
  291. $mBytes = 1;
  292. }
  293. } else {
  294. // ((0xC0 & (*in) != 0x80) && (mState != 0))
  295. // Incomplete multi-octet sequence.
  296. $i--;
  297. return UTF8_BAD_SEQINCOMPLETE;
  298. }
  299. }
  300. }
  301. if ( $mState != 0 ) {
  302. // Incomplete multi-octet sequence.
  303. $i--;
  304. return UTF8_BAD_SEQINCOMPLETE;
  305. }
  306. // No bad octets found
  307. $i = NULL;
  308. return FALSE;
  309. }
  310. //--------------------------------------------------------------------
  311. /**
  312. * Takes a return code from utf8_bad_identify() are returns a message
  313. * (in English) explaining what the problem is.
  314. * @param int return code from utf8_bad_identify
  315. * @return mixed string message or FALSE if return code unknown
  316. * @see utf8_bad_identify
  317. */
  318. function utf8_bad_explain($code) {
  319. switch ($code) {
  320. case UTF8_BAD_5OCTET:
  321. return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
  322. break;
  323. case UTF8_BAD_6OCTET:
  324. return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
  325. break;
  326. case UTF8_BAD_SEQID:
  327. return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
  328. break;
  329. case UTF8_BAD_NONSHORT:
  330. return 'From Unicode 3.1, non-shortest form is illegal';
  331. break;
  332. case UTF8_BAD_SURROGATE:
  333. return 'From Unicode 3.2, surrogate characters are illegal';
  334. break;
  335. case UTF8_BAD_UNIOUTRANGE:
  336. return 'Codepoints outside the Unicode range are illegal';
  337. break;
  338. case UTF8_BAD_SEQINCOMPLETE:
  339. return 'Incomplete multi-octet sequence';
  340. break;
  341. }
  342. trigger_error('Unknown error code: '.$code,E_USER_WARNING);
  343. return FALSE;
  344. }