PageRenderTime 38ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/libraries/phputf8/utils/position.php

https://bitbucket.org/eternaware/joomus
PHP | 173 lines | 59 code | 35 blank | 79 comment | 27 complexity | 84170318c0530f9822574e558de54fb2 MD5 | raw file
Possible License(s): LGPL-2.1
  1. <?php
  2. /**
  3. * Locate a byte index given a UTF-8 character index
  4. * @version $Id$
  5. * @package utf8
  6. * @subpackage position
  7. */
  8. //--------------------------------------------------------------------
  /**
   * Count the UTF-8 characters contained in a byte string.
   *
   * Private helper for utf8_byte_position(). Every byte that is not a
   * continuation byte (10xxxxxx) is counted as the start of one character,
   * which is the same definition of a character boundary used by
   * utf8_locate_next_chr() and utf8_locate_current_chr(). For well formed
   * UTF-8 this equals strlen(utf8_decode($chunk)) without relying on
   * utf8_decode(), which is deprecated as of PHP 8.2.
   *
   * @param string $chunk bytes to inspect (FALSE from substr() counts as empty)
   * @return int number of characters that start inside $chunk
   * @package utf8
   * @subpackage position
   */
  function utf8_byte_position_count_chars($chunk) {
      $chunk = (string) $chunk;
      if ($chunk === '') {
          return 0;
      }
      // count_chars() mode 0 returns a 256 entry frequency table keyed by byte
      // value; entries 0x80..0xBF are the continuation bytes
      $continuation = array_sum(array_slice(count_chars($chunk, 0), 0x80, 0x40));
      return strlen($chunk) - $continuation;
  }

  /**
   * Given a string and one or more character indexes into the string, in
   * terms of UTF-8 character positions, returns the byte index of each of
   * those characters. Can be useful when you want to use PHP's native
   * string functions, but be warned: locating the byte can be expensive.
   *
   * Takes a variable number of parameters - the first must be the string
   * to search, followed by 1 to n UTF-8 character positions to obtain byte
   * indexes for. It is more efficient to ask for several positions in one
   * call than to make repeated calls to this function.
   *
   * The positions are processed in ascending order, so when more than one
   * is supplied the returned array is ordered by ascending character
   * position, not by argument order. Positions at or below zero map to
   * byte index 0 and positions beyond the end of the string map to
   * strlen() of the string.
   *
   * @author Chris Smith<chris@jalakai.co.uk>
   * @param string string to locate index in
   * @param int (n times) UTF-8 character position
   * @return mixed int if exactly one position was given, array otherwise;
   *               FALSE if the first argument is not a string
   * @package utf8
   * @subpackage position
   */
  function utf8_byte_position() {
      $args = func_get_args();
      // plain assignment: array_shift() does not return a reference, so '=&'
      // only raised "Only variables should be assigned by reference"
      $str = array_shift($args);
      if (!is_string($str)) return false;

      $limit = strlen($str);
      $result = array();

      // trivial byte index, character offset pair
      $prev = array(0, 0);

      // use a short piece of str to estimate bytes per character
      // $i (& $j) -> byte indexes into $str
      $i = utf8_locate_next_chr($str, 300);

      // $c -> character offset into $str
      $c = utf8_byte_position_count_chars(substr($str, 0, $i));

      // deal with arguments from lowest to highest
      sort($args);

      foreach ($args as $offset) {
          // zero is trivial; negative positions are clamped to the start so
          // they can never be mistaken for "past the end of the string"
          if ($offset <= 0) { $result[] = 0; continue; }

          // ensure no endless looping
          $safety_valve = 50;

          do {
              if ($c == $prev[1]) {
                  // the reference pair spans no characters, so it cannot be
                  // used to estimate bytes per character
                  if ($i >= $limit && $offset >= $c) {
                      // genuinely at (or asked for something past) the end
                      $error = 0;
                      $i = $limit;
                      break;
                  }
                  // otherwise re-anchor the estimate on the start of the string
                  $prev = array(0, 0);
                  if ($c == 0) {
                      // nothing counted yet: take a fresh sample
                      $i = utf8_locate_next_chr($str, 300);
                      $c = utf8_byte_position_count_chars(substr($str, 0, $i));
                      if ($c == 0) {
                          // no character starts found at all - give up at the end
                          $error = 0;
                          $i = $limit;
                          break;
                      }
                  }
              }

              // linear estimate of the target byte from the last two known pairs
              $j = $i + (int)(($offset - $c) * ($i - $prev[0]) / ($c - $prev[1]));

              // correct to utf8 character boundary
              $j = utf8_locate_next_chr($str, $j);

              // save the index, offset for use next iteration
              $prev = array($i, $c);

              if ($j > $i) {
                  // moved forward: add the characters that were skipped over
                  $c += utf8_byte_position_count_chars(substr($str, $i, $j - $i));
              } else {
                  // moved backward: remove the characters that were stepped back over
                  $c -= utf8_byte_position_count_chars(substr($str, $j, $i - $j));
              }

              $error = abs($c - $offset);

              // ready for next time around
              $i = $j;

              // from 7 it is faster to iterate over the string
          } while (($error > 7) && --$safety_valve);

          if ($error && $error <= 7) {
              if ($c < $offset) {
                  // move up one character at a time
                  while ($error--) { $i = utf8_locate_next_chr($str, ++$i); }
              } else {
                  // move down one character at a time
                  while ($error--) { $i = utf8_locate_current_chr($str, --$i); }
              }

              // ready for next arg
              $c = $offset;
          }

          $result[] = $i;
      }

      if (count($result) == 1) {
          return $result[0];
      }
      return $result;
  }
  91. //--------------------------------------------------------------------
  /**
   * Finds the byte index at which the UTF-8 character covering a given
   * byte index begins. When the supplied index already points at the first
   * byte of a character it is returned unchanged; otherwise the search
   * steps backwards, one byte at a time, until the first byte of the
   * current character is reached. Indexes at or below zero yield 0 and
   * indexes at or beyond the end of the string yield the string length.
   *
   * @author Chris Smith<chris@jalakai.co.uk>
   * @param string
   * @param int byte index in the string
   * @return int byte index of start of current UTF-8 character
   * @package utf8
   * @subpackage position
   */
  function utf8_locate_current_chr( &$str, $idx ) {
      if ($idx <= 0) {
          return 0;
      }

      $length = strlen($str);
      if ($idx >= $length) {
          return $length;
      }

      // Trailing bytes of a multi-byte UTF-8 character look like 10xxxxxx, so
      // masking with 0xC0 and comparing against 0x80 identifies them
      // (assuming well formed UTF-8). Walk back until a byte that is not a
      // trailing byte is found or the start of the string is reached.
      $pos = $idx;
      while ($pos) {
          if ((ord($str[$pos]) & 0xC0) != 0x80) {
              break;
          }
          $pos--;
      }

      return $pos;
  }
  116. //--------------------------------------------------------------------
  /**
   * Finds the byte index at which the next UTF-8 character begins, seen
   * from a given byte index. When the supplied index already points at the
   * first byte of a character it is returned unchanged; otherwise the
   * search steps forwards, one byte at a time, past the trailing bytes of
   * the current character. Indexes at or below zero yield 0 and indexes at
   * or beyond the end of the string yield the string length.
   *
   * @author Chris Smith<chris@jalakai.co.uk>
   * @param string
   * @param int byte index in the string
   * @return int byte index of start of next UTF-8 character
   * @package utf8
   * @subpackage position
   */
  function utf8_locate_next_chr( &$str, $idx ) {
      if ($idx <= 0) {
          return 0;
      }

      $length = strlen($str);
      if ($idx >= $length) {
          return $length;
      }

      // Trailing bytes of a multi-byte UTF-8 character look like 10xxxxxx, so
      // masking with 0xC0 and comparing against 0x80 identifies them
      // (assuming well formed UTF-8). Skip forward over them; running off
      // the end of the string leaves the position equal to its length.
      for ($pos = $idx; $pos < $length; $pos++) {
          if ((ord($str[$pos]) & 0xC0) != 0x80) {
              break;
          }
      }

      return $pos;
  }