PageRenderTime 143ms CodeModel.GetById 100ms app.highlight 6ms RepoModel.GetById 34ms app.codeStats 1ms

/libraries/phputf8/utils/position.php

https://bitbucket.org/asosso/joomla25
PHP | 173 lines | 59 code | 35 blank | 79 comment | 27 complexity | 84170318c0530f9822574e558de54fb2 MD5 | raw file
  1<?php
  2/**
  3* Locate a byte index given a UTF-8 character index
  4* @version $Id$
  5* @package utf8
  6* @subpackage position
  7*/
  8
  9//--------------------------------------------------------------------
 10/**
 11* Given a string and a character index in the string, in
 12* terms of the UTF-8 character position, returns the byte
 13* index of that character. Can be useful when you want to
 14* PHP's native string functions but we warned, locating
 15* the byte can be expensive
 16* Takes variable number of parameters - first must be
 17* the search string then 1 to n UTF-8 character positions
 18* to obtain byte indexes for - it is more efficient to search
 19* the string for multiple characters at once, than make
 20* repeated calls to this function
 21*
 22* @author Chris Smith<chris@jalakai.co.uk>
 23* @param string string to locate index in
 24* @param int (n times)
 25* @return mixed - int if only one input int, array if more
 26* @return boolean TRUE if it's all ASCII
 27* @package utf8
 28* @subpackage position
 29*/
 30function utf8_byte_position() {
 31
 32    $args = func_get_args();
 33    $str =& array_shift($args);
 34    if (!is_string($str)) return false;
 35
 36    $result = array();
 37
 38    // trivial byte index, character offset pair
 39    $prev = array(0,0);
 40
 41    // use a short piece of str to estimate bytes per character
 42    // $i (& $j) -> byte indexes into $str
 43    $i = utf8_locate_next_chr($str, 300);
 44
 45    // $c -> character offset into $str
 46    $c = strlen(utf8_decode(substr($str,0,$i)));
 47
 48    // deal with arguments from lowest to highest
 49    sort($args);
 50
 51    foreach ($args as $offset) {
 52        // sanity checks FIXME
 53
 54        // 0 is an easy check
 55        if ($offset == 0) { $result[] = 0; continue; }
 56
 57        // ensure no endless looping
 58        $safety_valve = 50;
 59
 60        do {
 61
 62            if ( ($c - $prev[1]) == 0 ) {
 63                // Hack: gone past end of string
 64                $error = 0;
 65                $i = strlen($str);
 66                break;
 67            }
 68
 69            $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));
 70
 71            // correct to utf8 character boundary
 72            $j = utf8_locate_next_chr($str, $j);
 73
 74            // save the index, offset for use next iteration
 75            $prev = array($i,$c);
 76
 77            if ($j > $i) {
 78                // determine new character offset
 79                $c += strlen(utf8_decode(substr($str,$i,$j-$i)));
 80            } else {
 81                // ditto
 82                $c -= strlen(utf8_decode(substr($str,$j,$i-$j)));
 83            }
 84
 85            $error = abs($c-$offset);
 86
 87            // ready for next time around
 88            $i = $j;
 89
 90        // from 7 it is faster to iterate over the string
 91        } while ( ($error > 7) && --$safety_valve) ;
 92
 93        if ($error && $error <= 7) {
 94
 95            if ($c < $offset) {
 96                // move up
 97                while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
 98            } else {
 99                // move down
100                while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
101            }
102
103            // ready for next arg
104            $c = $offset;
105        }
106        $result[] = $i;
107    }
108
109    if ( count($result) == 1 ) {
110        return $result[0];
111    }
112
113    return $result;
114}
115
116//--------------------------------------------------------------------
/**
* Finds the start of the UTF-8 character that a byte index falls in.
*
* Given a string and any byte index, returns the byte index at which
* the UTF-8 character containing that byte begins. If the supplied
* index already points at the first byte of a character it is
* returned unchanged; otherwise the function steps backwards until
* it reaches the byte where the current character begins.
*
* An index of zero or less yields 0, and an index at or beyond the
* end of the string yields the string's byte length.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of current UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_current_chr( &$str, $idx ) {

    // at or before the first byte: the string start is the answer
    if ($idx <= 0) {
        return 0;
    }

    // at or past the last byte: clamp to the byte length
    $length = strlen($str);
    if ($idx >= $length) {
        return $length;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks
    // like 10xxxxxx, so masking with 0xC0 and comparing against 0x80
    // identifies such trailing bytes - assuming well formed UTF-8.
    // Walk backwards for as long as we are standing on one of them.
    $pos = $idx;
    while ($pos) {
        $isTrailingByte = ((ord($str[$pos]) & 0xC0) === 0x80);
        if (!$isTrailingByte) {
            break;
        }
        $pos--;
    }

    return $pos;
}
145
146//--------------------------------------------------------------------
/**
* Finds the next UTF-8 character boundary at or after a byte index.
*
* Given a string and any byte index, returns the byte index at which
* the next UTF-8 character starts, relative to the supplied position.
* If a character begins exactly at the supplied byte index, that
* index is returned unchanged; otherwise the function steps forwards
* past the remaining bytes of the character it landed in.
*
* An index of zero or less yields 0, and an index at or beyond the
* end of the string yields the string's byte length.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_next_chr( &$str, $idx ) {

    // at or before the first byte: the string start is the answer
    if ($idx <= 0) {
        return 0;
    }

    // at or past the last byte: clamp to the byte length
    $length = strlen($str);
    if ($idx >= $length) {
        return $length;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks
    // like 10xxxxxx, so masking with 0xC0 and comparing against 0x80
    // identifies such trailing bytes - assuming well formed UTF-8.
    // Walk forwards until we leave them or run off the end.
    $pos = $idx;
    while ($pos < $length) {
        $isTrailingByte = ((ord($str[$pos]) & 0xC0) === 0x80);
        if (!$isTrailingByte) {
            break;
        }
        $pos++;
    }

    return $pos;
}
173