/libraries/vendor/joomla/string/src/phputf8/utils/patterns.php
https://gitlab.com/vitaliylukin91/idea-rating · PHP · 64 lines · 30 code · 3 blank · 31 comment · 0 complexity · 0a730cb180908e3784f4caf1c017d1a8 MD5 · raw file
- <?php
- /**
- * PCRE Regular expressions for UTF-8. Note this file is not actually used by
- * the rest of the library but these regular expressions can be useful to have
- * available.
- * @see http://www.w3.org/International/questions/qa-forms-utf-8
- * @package utf8
- */
- //--------------------------------------------------------------------
- /**
- * PCRE Pattern to check a UTF-8 string is valid
- * Comes from W3 FAQ: Multilingual Forms
- * Note: modified to include full ASCII range including control chars
- * @see http://www.w3.org/International/questions/qa-forms-utf-8
- * @package utf8
- */
- $UTF8_VALID = '^('.
- '[\x00-\x7F]'. # ASCII (including control chars)
- '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
- '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
- '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
- '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
- '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
- '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
- '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
- ')*$';
- //--------------------------------------------------------------------
- /**
- * PCRE Pattern to match single UTF-8 characters
- * Comes from W3 FAQ: Multilingual Forms
- * Note: modified to include full ASCII range including control chars
- * @see http://www.w3.org/International/questions/qa-forms-utf-8
- * @package utf8
- */
- $UTF8_MATCH =
- '([\x00-\x7F])'. # ASCII (including control chars)
- '|([\xC2-\xDF][\x80-\xBF])'. # non-overlong 2-byte
- '|(\xE0[\xA0-\xBF][\x80-\xBF])'. # excluding overlongs
- '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})'. # straight 3-byte
- '|(\xED[\x80-\x9F][\x80-\xBF])'. # excluding surrogates
- '|(\xF0[\x90-\xBF][\x80-\xBF]{2})'. # planes 1-3
- '|([\xF1-\xF3][\x80-\xBF]{3})'. # planes 4-15
- '|(\xF4[\x80-\x8F][\x80-\xBF]{2})'; # plane 16
- //--------------------------------------------------------------------
- /**
- * PCRE Pattern to locate bad bytes in a UTF-8 string
- * Comes from W3 FAQ: Multilingual Forms
- * Note: modified to include full ASCII range including control chars
- * @see http://www.w3.org/International/questions/qa-forms-utf-8
- * @package utf8
- */
- $UTF8_BAD =
- '([\x00-\x7F]'. # ASCII (including control chars)
- '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
- '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
- '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
- '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
- '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
- '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
- '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
- '|(.{1}))'; # invalid byte