PageRenderTime 54ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/include/utf8/utils/validation.php

https://bitbucket.org/yoorick/fluxbb.pe
PHP | 186 lines | 92 code | 11 blank | 83 comment | 37 complexity | ef6bd0fb3d66f9a0c440a324cf3c07fe MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @version $Id: validation.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
  4. * Tools for validating that a UTF-8 string is well formed.
  5. * The Original Code is Mozilla Communicator client code.
  6. * The Initial Developer of the Original Code is
  7. * Netscape Communications Corporation.
  8. * Portions created by the Initial Developer are Copyright (C) 1998
  9. * the Initial Developer. All Rights Reserved.
  10. * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  11. * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  12. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  13. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  14. * @see http://hsivonen.iki.fi/php-utf8/
  15. * @package utf8
  16. * @subpackage validation
  17. */
  18. /**
  19. * Tests a string as to whether it's valid UTF-8 and supported by the
  20. * Unicode standard
  21. * Note: this function has been modified to simple return true or false
  22. * @author <hsivonen@iki.fi>
  23. * @param string UTF-8 encoded string
  24. * @return boolean true if valid
  25. * @see http://hsivonen.iki.fi/php-utf8/
  26. * @see utf8_compliant
  27. * @package utf8
  28. * @subpackage validation
  29. */
  30. function utf8_is_valid($str)
  31. {
  32. $mState = 0; // Cached expected number of octets after the current octet
  33. // until the beginning of the next UTF8 character sequence
  34. $mUcs4 = 0; // Cached Unicode character
  35. $mBytes = 1; // Cached expected number of octets in the current sequence
  36. $len = strlen($str);
  37. for($i = 0; $i < $len; $i++)
  38. {
  39. $in = ord($str{$i});
  40. if ( $mState == 0)
  41. {
  42. // When mState is zero we expect either a US-ASCII character or a multi-octet sequence.
  43. if (0 == (0x80 & ($in)))
  44. {
  45. $mBytes = 1; // US-ASCII, pass straight through
  46. }
  47. else if (0xC0 == (0xE0 & ($in)))
  48. {
  49. // First octet of 2 octet sequence
  50. $mUcs4 = ($in);
  51. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  52. $mState = 1;
  53. $mBytes = 2;
  54. }
  55. else if (0xE0 == (0xF0 & ($in)))
  56. {
  57. // First octet of 3 octet sequence
  58. $mUcs4 = ($in);
  59. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  60. $mState = 2;
  61. $mBytes = 3;
  62. }
  63. else if (0xF0 == (0xF8 & ($in)))
  64. {
  65. // First octet of 4 octet sequence
  66. $mUcs4 = ($in);
  67. $mUcs4 = ($mUcs4 & 0x07) << 18;
  68. $mState = 3;
  69. $mBytes = 4;
  70. }
  71. else if (0xF8 == (0xFC & ($in)))
  72. {
  73. /* First octet of 5 octet sequence.
  74. *
  75. * This is illegal because the encoded codepoint must be either
  76. * (a) not the shortest form or
  77. * (b) outside the Unicode range of 0-0x10FFFF.
  78. * Rather than trying to resynchronize, we will carry on until the end
  79. * of the sequence and let the later error handling code catch it.
  80. */
  81. $mUcs4 = ($in);
  82. $mUcs4 = ($mUcs4 & 0x03) << 24;
  83. $mState = 4;
  84. $mBytes = 5;
  85. }
  86. else if (0xFC == (0xFE & ($in)))
  87. {
  88. // First octet of 6 octet sequence, see comments for 5 octet sequence.
  89. $mUcs4 = ($in);
  90. $mUcs4 = ($mUcs4 & 1) << 30;
  91. $mState = 5;
  92. $mBytes = 6;
  93. }
  94. else
  95. {
  96. // Current octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.
  97. return false;
  98. }
  99. }
  100. else
  101. {
  102. // When mState is non-zero, we expect a continuation of the multi-octet sequence
  103. if (0x80 == (0xC0 & ($in)))
  104. {
  105. // Legal continuation.
  106. $shift = ($mState - 1) * 6;
  107. $tmp = $in;
  108. $tmp = ($tmp & 0x0000003F) << $shift;
  109. $mUcs4 |= $tmp;
  110. /**
  111. * End of the multi-octet sequence. mUcs4 now contains the final
  112. * Unicode codepoint to be output
  113. */
  114. if (0 == --$mState)
  115. {
  116. /*
  117. * Check for illegal sequences and codepoints.
  118. */
  119. // From Unicode 3.1, non-shortest form is illegal
  120. if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
  121. ((4 == $mBytes) && ($mUcs4 < 0x10000)) || (4 < $mBytes) ||
  122. // From Unicode 3.2, surrogate characters are illegal
  123. (($mUcs4 & 0xFFFFF800) == 0xD800) ||
  124. // Codepoints outside the Unicode range are illegal
  125. ($mUcs4 > 0x10FFFF))
  126. {
  127. return FALSE;
  128. }
  129. // Initialize UTF8 cache
  130. $mState = 0;
  131. $mUcs4 = 0;
  132. $mBytes = 1;
  133. }
  134. }
  135. else
  136. {
  137. /**
  138. *((0xC0 & (*in) != 0x80) && (mState != 0))
  139. * Incomplete multi-octet sequence.
  140. */
  141. return false;
  142. }
  143. }
  144. }
  145. return true;
  146. }
  147. /**
  148. * Tests whether a string complies as UTF-8. This will be much
  149. * faster than utf8_is_valid, but will pass five and six octet
  150. * UTF-8 sequences, which are not supported by Unicode and
  151. * so cannot be displayed correctly in a browser. In other words
  152. * it is not as strict as utf8_is_valid but it's faster. If you use
  153. * is to validate user input, you place yourself at the risk that
  154. * attackers will be able to inject 5 and 6 byte sequences (which
  155. * may or may not be a significant risk, depending on what you are
  156. * are doing)
  157. * Note: Does not pass five and six octet UTF-8 sequences anymore in
  158. * in the unit tests.
  159. * @see utf8_is_valid
  160. * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
  161. * @param string UTF-8 string to check
  162. * @return boolean TRUE if string is valid UTF-8
  163. * @package utf8
  164. * @subpackage validation
  165. */
  166. function utf8_compliant($str)
  167. {
  168. if (strlen($str) == 0)
  169. return true;
  170. // If even just the first character can be matched, when the /u
  171. // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
  172. // invalid, nothing at all will match, even if the string contains
  173. // some valid sequences
  174. return (preg_match('/^.{1}/us', $str, $ar) == 1);
  175. }