PageRenderTime 44ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/include/utf8/utils/position.php

https://bitbucket.org/yoorick/fluxbb.pe
PHP | 171 lines | 80 code | 25 blank | 66 comment | 25 complexity | 03eb213f6b767a4f1137c6f9e242e572 MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * Locate a byte index given a UTF-8 character index
  4. * @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $
  5. * @package utf8
  6. * @subpackage position
  7. */
  8. /**
  9. * Given a string and a character index in the string, in
  10. * terms of the UTF-8 character position, returns the byte
  11. * index of that character. Can be useful when you want to
  12. * PHP's native string functions but we warned, locating
  13. * the byte can be expensive
  14. * Takes variable number of parameters - first must be
  15. * the search string then 1 to n UTF-8 character positions
  16. * to obtain byte indexes for - it is more efficient to search
  17. * the string for multiple characters at once, than make
  18. * repeated calls to this function
  19. *
  20. * @author Chris Smith<chris@jalakai.co.uk>
  21. * @param string string to locate index in
  22. * @param int (n times)
  23. * @return mixed - int if only one input int, array if more
  24. * @return boolean TRUE if it's all ASCII
  25. * @package utf8
  26. * @subpackage position
  27. */
  28. function utf8_byte_position()
  29. {
  30. $args = func_get_args();
  31. $str =& array_shift($args);
  32. if (!is_string($str))
  33. return false;
  34. $result = array();
  35. $prev = array(0, 0); // Trivial byte index, character offset pair
  36. $i = utf8_locate_next_chr($str, 300); // Use a short piece of str to estimate bytes per character. $i (& $j) -> byte indexes into $str
  37. $c = strlen(utf8_decode(substr($str, 0, $i))); // $c -> character offset into $str
  38. // Deal with arguments from lowest to highest
  39. sort($args);
  40. foreach ($args as $offset)
  41. {
  42. // Sanity checks FIXME
  43. // 0 is an easy check
  44. if ($offset == 0)
  45. {
  46. $result[] = 0; continue;
  47. }
  48. // Ensure no endless looping
  49. $safety_valve = 50;
  50. do
  51. {
  52. if (($c - $prev[1]) == 0)
  53. {
  54. // Hack: gone past end of string
  55. $error = 0;
  56. $i = strlen($str);
  57. break;
  58. }
  59. $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));
  60. $j = utf8_locate_next_chr($str, $j); // Correct to utf8 character boundary
  61. $prev = array($i,$c); // Save the index, offset for use next iteration
  62. if ($j > $i)
  63. $c += strlen(utf8_decode(substr($str, $i, $j-$i))); // Determine new character offset
  64. else
  65. $c -= strlen(utf8_decode(substr($str, $j, $i-$j))); // Ditto
  66. $error = abs($c-$offset);
  67. $i = $j; // Ready for next time around
  68. }
  69. while (($error > 7) && --$safety_valve); // From 7 it is faster to iterate over the string
  70. if ($error && $error <= 7)
  71. {
  72. if ($c < $offset)
  73. {
  74. // Move up
  75. while ($error--)
  76. $i = utf8_locate_next_chr($str, ++$i);
  77. }
  78. else
  79. {
  80. // Move down
  81. while ($error--)
  82. $i = utf8_locate_current_chr($str, --$i);
  83. }
  84. // Ready for next arg
  85. $c = $offset;
  86. }
  87. $result[] = $i;
  88. }
  89. if (count($result) == 1)
  90. return $result[0];
  91. return $result;
  92. }
  93. /**
  94. * Given a string and any byte index, returns the byte index
  95. * of the start of the current UTF-8 character, relative to supplied
  96. * position. If the current character begins at the same place as the
  97. * supplied byte index, that byte index will be returned. Otherwise
  98. * this function will step backwards, looking for the index where
  99. * curent UTF-8 character begins
  100. * @author Chris Smith<chris@jalakai.co.uk>
  101. * @param string
  102. * @param int byte index in the string
  103. * @return int byte index of start of next UTF-8 character
  104. * @package utf8
  105. * @subpackage position
  106. */
  107. function utf8_locate_current_chr( &$str, $idx )
  108. {
  109. if ($idx <= 0)
  110. return 0;
  111. $limit = strlen($str);
  112. if ($idx >= $limit)
  113. return $limit;
  114. // Binary value for any byte after the first in a multi-byte UTF-8 character
  115. // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
  116. // of byte - assuming well formed UTF-8
  117. while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80))
  118. $idx--;
  119. return $idx;
  120. }
  121. /**
  122. * Given a string and any byte index, returns the byte index
  123. * of the start of the next UTF-8 character, relative to supplied
  124. * position. If the next character begins at the same place as the
  125. * supplied byte index, that byte index will be returned.
  126. * @author Chris Smith<chris@jalakai.co.uk>
  127. * @param string
  128. * @param int byte index in the string
  129. * @return int byte index of start of next UTF-8 character
  130. * @package utf8
  131. * @subpackage position
  132. */
  133. function utf8_locate_next_chr(&$str, $idx)
  134. {
  135. if ($idx <= 0)
  136. return 0;
  137. $limit = strlen($str);
  138. if ($idx >= $limit)
  139. return $limit;
  140. // Binary value for any byte after the first in a multi-byte UTF-8 character
  141. // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
  142. // of byte - assuming well formed UTF-8
  143. while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80))
  144. $idx++;
  145. return $idx;
  146. }