PageRenderTime 64ms CodeModel.GetById 31ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/contrib/utf8/utils/validation.php

https://github.com/matthiask/swisdk2
PHP | 185 lines | 71 code | 26 blank | 88 comment | 38 complexity | 408ba98e8784cc1277ebd23e973d335e MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @version $Id: validation.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
  4. * Tools for validating that a UTF-8 string is well formed.
  5. * The Original Code is Mozilla Communicator client code.
  6. * The Initial Developer of the Original Code is
  7. * Netscape Communications Corporation.
  8. * Portions created by the Initial Developer are Copyright (C) 1998
  9. * the Initial Developer. All Rights Reserved.
  10. * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  11. * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  12. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  13. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  14. * @see http://hsivonen.iki.fi/php-utf8/
  15. * @package utf8
  16. * @subpackage validation
  17. */
  18. //--------------------------------------------------------------------
  19. /**
  20. * Tests a string as to whether it's valid UTF-8 and supported by the
  21. * Unicode standard
  22. * Note: this function has been modified to simple return true or false
  23. * @author <hsivonen@iki.fi>
  24. * @param string UTF-8 encoded string
  25. * @return boolean true if valid
  26. * @see http://hsivonen.iki.fi/php-utf8/
  27. * @see utf8_compliant
  28. * @package utf8
  29. * @subpackage validation
  30. */
  31. function utf8_is_valid($str) {
  32. $mState = 0; // cached expected number of octets after the current octet
  33. // until the beginning of the next UTF8 character sequence
  34. $mUcs4 = 0; // cached Unicode character
  35. $mBytes = 1; // cached expected number of octets in the current sequence
  36. $len = strlen($str);
  37. for($i = 0; $i < $len; $i++) {
  38. $in = ord($str{$i});
  39. if ( $mState == 0) {
  40. // When mState is zero we expect either a US-ASCII character or a
  41. // multi-octet sequence.
  42. if (0 == (0x80 & ($in))) {
  43. // US-ASCII, pass straight through.
  44. $mBytes = 1;
  45. } else if (0xC0 == (0xE0 & ($in))) {
  46. // First octet of 2 octet sequence
  47. $mUcs4 = ($in);
  48. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  49. $mState = 1;
  50. $mBytes = 2;
  51. } else if (0xE0 == (0xF0 & ($in))) {
  52. // First octet of 3 octet sequence
  53. $mUcs4 = ($in);
  54. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  55. $mState = 2;
  56. $mBytes = 3;
  57. } else if (0xF0 == (0xF8 & ($in))) {
  58. // First octet of 4 octet sequence
  59. $mUcs4 = ($in);
  60. $mUcs4 = ($mUcs4 & 0x07) << 18;
  61. $mState = 3;
  62. $mBytes = 4;
  63. } else if (0xF8 == (0xFC & ($in))) {
  64. /* First octet of 5 octet sequence.
  65. *
  66. * This is illegal because the encoded codepoint must be either
  67. * (a) not the shortest form or
  68. * (b) outside the Unicode range of 0-0x10FFFF.
  69. * Rather than trying to resynchronize, we will carry on until the end
  70. * of the sequence and let the later error handling code catch it.
  71. */
  72. $mUcs4 = ($in);
  73. $mUcs4 = ($mUcs4 & 0x03) << 24;
  74. $mState = 4;
  75. $mBytes = 5;
  76. } else if (0xFC == (0xFE & ($in))) {
  77. // First octet of 6 octet sequence, see comments for 5 octet sequence.
  78. $mUcs4 = ($in);
  79. $mUcs4 = ($mUcs4 & 1) << 30;
  80. $mState = 5;
  81. $mBytes = 6;
  82. } else {
  83. /* Current octet is neither in the US-ASCII range nor a legal first
  84. * octet of a multi-octet sequence.
  85. */
  86. return FALSE;
  87. }
  88. } else {
  89. // When mState is non-zero, we expect a continuation of the multi-octet
  90. // sequence
  91. if (0x80 == (0xC0 & ($in))) {
  92. // Legal continuation.
  93. $shift = ($mState - 1) * 6;
  94. $tmp = $in;
  95. $tmp = ($tmp & 0x0000003F) << $shift;
  96. $mUcs4 |= $tmp;
  97. /**
  98. * End of the multi-octet sequence. mUcs4 now contains the final
  99. * Unicode codepoint to be output
  100. */
  101. if (0 == --$mState) {
  102. /*
  103. * Check for illegal sequences and codepoints.
  104. */
  105. // From Unicode 3.1, non-shortest form is illegal
  106. if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
  107. ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
  108. ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
  109. (4 < $mBytes) ||
  110. // From Unicode 3.2, surrogate characters are illegal
  111. (($mUcs4 & 0xFFFFF800) == 0xD800) ||
  112. // Codepoints outside the Unicode range are illegal
  113. ($mUcs4 > 0x10FFFF)) {
  114. return FALSE;
  115. }
  116. //initialize UTF8 cache
  117. $mState = 0;
  118. $mUcs4 = 0;
  119. $mBytes = 1;
  120. }
  121. } else {
  122. /**
  123. *((0xC0 & (*in) != 0x80) && (mState != 0))
  124. * Incomplete multi-octet sequence.
  125. */
  126. return FALSE;
  127. }
  128. }
  129. }
  130. return TRUE;
  131. }
  132. //--------------------------------------------------------------------
  133. /**
  134. * Tests whether a string complies as UTF-8. This will be much
  135. * faster than utf8_is_valid but will pass five and six octet
  136. * UTF-8 sequences, which are not supported by Unicode and
  137. * so cannot be displayed correctly in a browser. In other words
  138. * it is not as strict as utf8_is_valid but it's faster. If you use
  139. * is to validate user input, you place yourself at the risk that
  140. * attackers will be able to inject 5 and 6 byte sequences (which
  141. * may or may not be a significant risk, depending on what you are
  142. * are doing)
  143. * @see utf8_is_valid
  144. * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
  145. * @param string UTF-8 string to check
  146. * @return boolean TRUE if string is valid UTF-8
  147. * @package utf8
  148. * @subpackage validation
  149. */
  150. function utf8_compliant($str) {
  151. if ( strlen($str) == 0 ) {
  152. return TRUE;
  153. }
  154. // If even just the first character can be matched, when the /u
  155. // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
  156. // invalid, nothing at all will match, even if the string contains
  157. // some valid sequences
  158. return (preg_match('/^.{1}/us',$str,$ar) == 1);
  159. }