PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/phputf8/utf8_to_ascii.php

https://github.com/michaeljoyce/pkp-lib
PHP | 145 lines | 66 code | 45 blank | 34 comment | 22 complexity | 488ed3a1f840b302d76bf9964196e78a MD5 | raw file
Possible License(s): LGPL-2.1, BSD-3-Clause
  1. <?php
  2. /**
  3. * US-ASCII transliterations of Unicode text
  4. * @version $Id$
  5. * @package utf8_to_ascii
  6. */
  # Directory holding the transliteration bank files (x00.php, x01.php, ...).
  # Only defined here when the including script has not already chosen a location.
  defined('UTF8_TO_ASCII_DB') || define('UTF8_TO_ASCII_DB', dirname(__FILE__) . '/db');
  10. //--------------------------------------------------------------------
  /**
  * US-ASCII transliterations of Unicode text
  * Ported Sean M. Burke's Text::Unidecode Perl module (He did all the hard work!)
  * Warning: you should only pass this well formed UTF-8!
  * Continuation bytes are not validated; only an impossible lead byte or a
  * multi-byte sequence cut off by the end of the string is detected.
  * The transliterated string is collected in a PHP output buffer, so memory use
  * can grow by up to the size of the result while the function runs.
  * Transliteration tables ("banks" of 256 code points) are loaded on demand from
  * UTF8_TO_ASCII_DB . '/xNN.php' and cached in a static array for the rest of
  * the request.
  * @see http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
  * @param string UTF-8 string to convert
  * @param string (default = ?) Character use if character unknown
  * @return string|false US-ASCII string, or FALSE (after trigger_error) when the
  *         input is detectably malformed or a bank file fails to load
  * @package utf8_to_ascii
  */
  function utf8_to_ascii($str, $unknown = '?') {

      # The database for transliteration stored here.
      # NOTE: the bank files are include'd in this function's scope and are
      # expected to assign into $UTF8_TO_ASCII[<bank>] (their content is not
      # visible from this file), so this variable must keep its name.
      static $UTF8_TO_ASCII = array();

      $len = strlen($str);
      if ( $len == 0 ) { return ''; }

      $i = 0;

      # Use an output buffer to collect the transliterated string.
      # Every early return below must close this buffer exactly once.
      ob_start();

      while ( $i < $len ) {

          $ord0 = ord($str[$i]);

          # Work out the sequence length from the lead byte alone, so that no
          # continuation byte is touched before we know it exists.
          if ( $ord0 <= 127 ) {
              $increment = 1;            # 1 byte - ASCII
          } else if ( $ord0 >= 192 && $ord0 <= 223 ) {
              $increment = 2;
          } else if ( $ord0 >= 224 && $ord0 <= 239 ) {
              $increment = 3;
          } else if ( $ord0 >= 240 && $ord0 <= 247 ) {
              $increment = 4;
          } else {
              $increment = 0;            # stray continuation byte or 0xF8-0xFF
          }

          # Invalid lead byte, or a sequence truncated by the end of the string
          if ( $increment == 0 || $i + $increment > $len ) {
              ob_end_clean();
              trigger_error("utf8_to_ascii: looks like badly formed UTF-8 at byte $i");
              return FALSE;
          }

          # Decode the code point (continuation bytes contribute their low 6 bits)
          switch ( $increment ) {
              case 1:
                  $ord = $ord0;
                  break;
              case 2:
                  $ord = ( $ord0 - 192 ) * 64
                       + ( ord($str[$i+1]) - 128 );
                  break;
              case 3:
                  $ord = ( $ord0 - 224 ) * 4096
                       + ( ord($str[$i+1]) - 128 ) * 64
                       + ( ord($str[$i+2]) - 128 );
                  break;
              default:
                  $ord = ( $ord0 - 240 ) * 262144
                       + ( ord($str[$i+1]) - 128 ) * 4096
                       + ( ord($str[$i+2]) - 128 ) * 64
                       + ( ord($str[$i+3]) - 128 );
                  break;
          }

          # High bits select the bank file, low 8 bits the entry inside it
          $bank = $ord >> 8;

          # If we haven't used anything from this bank before, need to load it...
          if ( !array_key_exists($bank, $UTF8_TO_ASCII) ) {

              $bankfile = UTF8_TO_ASCII_DB . '/' . sprintf("x%02x", $bank) . '.php';

              if ( file_exists($bankfile) ) {

                  # Load the appropriate database
                  if ( !(include $bankfile) ) {
                      # Stop here: the buffer is already discarded, so carrying
                      # on would echo to the caller's real output.
                      ob_end_clean();
                      trigger_error("utf8_to_ascii: unable to load $bankfile");
                      return FALSE;
                  }

                  # A bank file that loaded but did not define its bank is
                  # treated like an empty bank rather than left undefined.
                  if ( !isset($UTF8_TO_ASCII[$bank]) || !is_array($UTF8_TO_ASCII[$bank]) ) {
                      $UTF8_TO_ASCII[$bank] = array();
                  }

              } else {

                  # Some banks are deliberately empty
                  $UTF8_TO_ASCII[$bank] = array();

              }
          }

          $newchar = $ord & 255;

          if ( array_key_exists($newchar, $UTF8_TO_ASCII[$bank]) ) {
              echo $UTF8_TO_ASCII[$bank][$newchar];
          } else {
              echo $unknown;
          }

          $i += $increment;
      }

      $str = ob_get_contents();
      ob_end_clean();

      return $str;
  }