PageRenderTime 45ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/phputf8/utils/validation.php

https://github.com/dianaprajescu/joomla-framework
PHP | 181 lines | 71 code | 26 blank | 84 comment | 38 complexity | 5d968ac8cf4a5d611be340ac6381bbf9 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1
  1. <?php
  2. /**
  3. * Tools for validating that a UTF-8 string is well formed.
  4. * The Original Code is Mozilla Communicator client code.
  5. * The Initial Developer of the Original Code is
  6. * Netscape Communications Corporation.
  7. * Portions created by the Initial Developer are Copyright (C) 1998
  8. * the Initial Developer. All Rights Reserved.
  9. * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  10. * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  11. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  12. * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  13. * @see http://hsivonen.iki.fi/php-utf8/
  14. * @package utf8
  15. */
  16. //--------------------------------------------------------------------
  /**
   * Tests whether a string is well-formed UTF-8 and encodes only code points
   * permitted by the Unicode standard.
   *
   * The string is scanned octet by octet with a small state machine. It is
   * rejected when it contains:
   *  - an octet that cannot start a sequence (stray continuation octet, or a
   *    lead octet announcing a 5/6-octet sequence or 0xFE/0xFF),
   *  - a missing or malformed continuation octet, including a sequence that
   *    is cut off by the end of the string,
   *  - a non-shortest-form (overlong) encoding,
   *  - a UTF-16 surrogate (U+D800..U+DFFF),
   *  - a code point above U+10FFFF.
   *
   * Note: this function has been modified to simply return true or false.
   *
   * @author <hsivonen@iki.fi>
   * @param string $str byte string to check
   * @return boolean true if $str is valid UTF-8 (the empty string is valid)
   * @see http://hsivonen.iki.fi/php-utf8/
   * @see utf8_compliant
   * @package utf8
   */
  function utf8_is_valid($str) {
      $mState = 0; // continuation octets still expected for the current sequence
      $mUcs4 = 0;  // code point accumulated so far
      $mBytes = 1; // total length in octets of the current sequence

      $len = strlen($str);
      for ($i = 0; $i < $len; $i++) {
          // Square-bracket offset: the {$i} form was removed in PHP 8.0.
          $in = ord($str[$i]);

          if ($mState === 0) {
              // Start of a character: US-ASCII or the lead octet of a sequence.
              if (($in & 0x80) === 0) {
                  // US-ASCII, nothing to accumulate.
                  $mBytes = 1;
              } elseif (($in & 0xE0) === 0xC0) {
                  // Lead octet of a 2-octet sequence.
                  $mUcs4 = ($in & 0x1F) << 6;
                  $mState = 1;
                  $mBytes = 2;
              } elseif (($in & 0xF0) === 0xE0) {
                  // Lead octet of a 3-octet sequence.
                  $mUcs4 = ($in & 0x0F) << 12;
                  $mState = 2;
                  $mBytes = 3;
              } elseif (($in & 0xF8) === 0xF0) {
                  // Lead octet of a 4-octet sequence.
                  $mUcs4 = ($in & 0x07) << 18;
                  $mState = 3;
                  $mBytes = 4;
              } else {
                  // Stray continuation octet, 0xFE/0xFF, or the lead octet of
                  // a 5/6-octet sequence. The latter can only encode an
                  // overlong form or a value beyond U+10FFFF, so it is never
                  // valid and there is no need to read the rest of it.
                  return false;
              }
              continue;
          }

          // Inside a sequence: only a continuation octet (10xxxxxx) is legal.
          if (($in & 0xC0) !== 0x80) {
              return false;
          }

          // Merge the six payload bits at the position given by the number
          // of continuation octets still outstanding.
          $mUcs4 |= ($in & 0x3F) << (($mState - 1) * 6);
          $mState--;
          if ($mState > 0) {
              continue;
          }

          // Sequence complete: $mUcs4 holds the decoded code point.

          // From Unicode 3.1, non-shortest form is illegal.
          if ((2 === $mBytes && $mUcs4 < 0x0080)
              || (3 === $mBytes && $mUcs4 < 0x0800)
              || (4 === $mBytes && $mUcs4 < 0x10000)) {
              return false;
          }

          // From Unicode 3.2, surrogate code points are illegal. A range test
          // is used instead of a 0xFFFFF800 mask because that literal does
          // not fit in an integer on 32-bit builds.
          if ($mUcs4 >= 0xD800 && $mUcs4 <= 0xDFFF) {
              return false;
          }

          // Code points outside the Unicode range are illegal.
          if ($mUcs4 > 0x10FFFF) {
              return false;
          }

          // Reset the cache for the next character.
          $mUcs4 = 0;
          $mBytes = 1;
      }

      // A non-zero state here means the string ended in the middle of a
      // multi-octet sequence, which is not valid UTF-8.
      return $mState === 0;
  }
  129. //--------------------------------------------------------------------
  /**
   * Quick check that a string is acceptable to PCRE as UTF-8.
   *
   * An empty string is reported as compliant without consulting PCRE. Any
   * other string is handed to preg_match() with the /u modifier: when the
   * subject is not valid UTF-8 nothing at all matches, even if parts of the
   * string are valid, so matching a single leading character is sufficient.
   *
   * The original author describes this check as much faster than
   * utf8_is_valid but less strict, warning that it may let five and six
   * octet sequences through (these are not supported by Unicode). If you use
   * it to validate user input, take into account that attackers may be able
   * to inject such sequences, which may or may not be a significant risk
   * depending on what you are doing.
   * NOTE(review): how strict the check really is depends on the PCRE library
   * PHP was built against - confirm before relying on it for untrusted input.
   *
   * @see utf8_is_valid
   * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
   * @param string $str UTF-8 string to check
   * @return boolean TRUE if string is valid UTF-8
   * @package utf8
   */
  function utf8_compliant($str) {
      $isEmpty = (strlen($str) == 0);

      // Short-circuit keeps PCRE out of the picture for the empty string;
      // otherwise one successfully matched character proves that PCRE
      // accepted the whole subject as UTF-8.
      return $isEmpty || (1 == preg_match('/^.{1}/us', $str));
  }