/libraries/vendor/joomla/string/src/phputf8/utils/patterns.php

https://gitlab.com/vitaliylukin91/idea-rating · PHP · 64 lines · 30 code · 3 blank · 31 comment · 0 complexity · 0a730cb180908e3784f4caf1c017d1a8 MD5 · raw file

  1. <?php
  2. /**
  3. * PCRE Regular expressions for UTF-8. Note this file is not actually used by
  4. * the rest of the library but these regular expressions can be useful to have
  5. * available.
  6. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  7. * @package utf8
  8. */
  9. //--------------------------------------------------------------------
  10. /**
  11. * PCRE Pattern to check a UTF-8 string is valid
  12. * Comes from W3 FAQ: Multilingual Forms
  13. * Note: modified to include full ASCII range including control chars
  14. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  15. * @package utf8
  16. */
  17. $UTF8_VALID = '^('.
  18. '[\x00-\x7F]'. # ASCII (including control chars)
  19. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  20. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  21. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  22. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  23. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  24. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  25. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  26. ')*$';
  27. //--------------------------------------------------------------------
  28. /**
  29. * PCRE Pattern to match single UTF-8 characters
  30. * Comes from W3 FAQ: Multilingual Forms
  31. * Note: modified to include full ASCII range including control chars
  32. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  33. * @package utf8
  34. */
  35. $UTF8_MATCH =
  36. '([\x00-\x7F])'. # ASCII (including control chars)
  37. '|([\xC2-\xDF][\x80-\xBF])'. # non-overlong 2-byte
  38. '|(\xE0[\xA0-\xBF][\x80-\xBF])'. # excluding overlongs
  39. '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})'. # straight 3-byte
  40. '|(\xED[\x80-\x9F][\x80-\xBF])'. # excluding surrogates
  41. '|(\xF0[\x90-\xBF][\x80-\xBF]{2})'. # planes 1-3
  42. '|([\xF1-\xF3][\x80-\xBF]{3})'. # planes 4-15
  43. '|(\xF4[\x80-\x8F][\x80-\xBF]{2})'; # plane 16
  44. //--------------------------------------------------------------------
  45. /**
  46. * PCRE Pattern to locate bad bytes in a UTF-8 string
  47. * Comes from W3 FAQ: Multilingual Forms
  48. * Note: modified to include full ASCII range including control chars
  49. * @see http://www.w3.org/International/questions/qa-forms-utf-8
  50. * @package utf8
  51. */
  52. $UTF8_BAD =
  53. '([\x00-\x7F]'. # ASCII (including control chars)
  54. '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  55. '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
  56. '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  57. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  58. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  59. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  60. '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
  61. '|(.{1}))'; # invalid byte