PageRenderTime 46ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/MantisBT/library/utf8/exp/regexunicode.php

https://bitbucket.org/crypticrod/sr_wp_code
PHP | 37 lines | 29 code | 2 blank | 6 comment | 11 complexity | a33979c1a26120bc0293368c29efc9e3 MD5 | raw file
Possible License(s): AGPL-1.0, GPL-2.0, LGPL-2.1, GPL-3.0, LGPL-2.0, AGPL-3.0
  1. <?php
  2. /**
  3. * This was an experiment to see how a PCRE based UTF-8 to unicode
  4. * code point converter would perform, vs. a character by character
  5. * converter (as in '../utf8_unicode.php'). Basically this is very
  6. * slow by comparison but perhaps interesting code anyway
  7. */
  8. $UTF8_MATCH =
  9. '([\x09\x0A\x0D\x20-\x7E])'. # ASCII (excluding control chars)
  10. '|([\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
  11. '|\xE0[\xA0-\xBF][\x80-\xBF])'. # excluding overlongs
  12. '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
  13. '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
  14. '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
  15. '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
  16. '|\xF4[\x80-\x8F][\x80-\xBF]{2})'; # plane 16
  17. '|(.{1})'; # catch bad bytes
  18. function toCodePoint($matches) {
  19. global $points;
  20. if ( $matches[1] != '' ) {
  21. $points[]= ord($matches[1]);
  22. } else if ( $matches[2] != '' ) {
  23. $points[]= ( ( ord($matches[2][0]) % 32 ) * 64 ) + ( ord($matches[2][1]) % 64 );
  24. } else if ( $matches[3] != '' ) {
  25. $points[]= ( ( ord($matches[3][0]) % 16 ) * 4096 ) + ( ( ord($matches[3][1]) % 64 ) * 64 ) + ( ord($matches[3][2]) % 64 );
  26. } else if ( $matches[4] != '' ) {
  27. trigger_error('Invalid byte in UTF-8',E_USER_WARNING);
  28. return '';
  29. }
  30. return $matches[0];
  31. }
  32. $str = file_get_contents('../tests/data/utf8.html');
  33. $points = array();
  34. preg_replace_callback('/'.$UTF8_MATCH.'/S','toCodePoint',$str);
  35. print_r($points);