PageRenderTime 108ms CodeModel.GetById 90ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/libraries/phputf8/utils/bad.php

https://bitbucket.org/asosso/joomla25
PHP | 421 lines | 195 code | 51 blank | 175 comment | 52 complexity | 64bf72e82f9bf01e02e69d9293f24325 MD5 | raw file
  1<?php
  2/**
  3* @version $Id$
  4* Tools for locating / replacing bad bytes in UTF-8 strings
  5* The Original Code is Mozilla Communicator client code.
  6* The Initial Developer of the Original Code is
  7* Netscape Communications Corporation.
  8* Portions created by the Initial Developer are Copyright (C) 1998
  9* the Initial Developer. All Rights Reserved.
 10* Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 11* Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 12* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 13* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 14* @see http://hsivonen.iki.fi/php-utf8/
 15* @package utf8
 16* @subpackage bad
 17* @see utf8_is_valid
 18*/
 19
 20//--------------------------------------------------------------------
 21/**
 22* Locates the first bad byte in a UTF-8 string returning it's
 23* byte index in the string
 24* PCRE Pattern to locate bad bytes in a UTF-8 string
 25* Comes from W3 FAQ: Multilingual Forms
 26* Note: modified to include full ASCII range including control chars
 27* @see http://www.w3.org/International/questions/qa-forms-utf-8
 28* @param string
 29* @return mixed integer byte index or FALSE if no bad found
 30* @package utf8
 31* @subpackage bad
 32*/
 33function utf8_bad_find($str) {
 34    $UTF8_BAD =
 35    '([\x00-\x7F]'.                          # ASCII (including control chars)
 36    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 37    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 38    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 39    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 40    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 41    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 42    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 43    '|(.{1}))';                              # invalid byte
 44    $pos = 0;
 45    $badList = array();
 46    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 47        $bytes = strlen($matches[0]);
 48        if ( isset($matches[2])) {
 49            return $pos;
 50        }
 51        $pos += $bytes;
 52        $str = substr($str,$bytes);
 53    }
 54    return FALSE;
 55}
 56
 57//--------------------------------------------------------------------
 58/**
 59* Locates all bad bytes in a UTF-8 string and returns a list of their
 60* byte index in the string
 61* PCRE Pattern to locate bad bytes in a UTF-8 string
 62* Comes from W3 FAQ: Multilingual Forms
 63* Note: modified to include full ASCII range including control chars
 64* @see http://www.w3.org/International/questions/qa-forms-utf-8
 65* @param string
 66* @return mixed array of integers or FALSE if no bad found
 67* @package utf8
 68* @subpackage bad
 69*/
 70function utf8_bad_findall($str) {
 71    $UTF8_BAD =
 72    '([\x00-\x7F]'.                          # ASCII (including control chars)
 73    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 74    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 75    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 76    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 77    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 78    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 79    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 80    '|(.{1}))';                              # invalid byte
 81    $pos = 0;
 82    $badList = array();
 83    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 84        $bytes = strlen($matches[0]);
 85        if ( isset($matches[2])) {
 86            $badList[] = $pos;
 87        }
 88        $pos += $bytes;
 89        $str = substr($str,$bytes);
 90    }
 91    if ( count($badList) > 0 ) {
 92        return $badList;
 93    }
 94    return FALSE;
 95}
 96
 97//--------------------------------------------------------------------
 98/**
 99* Strips out any bad bytes from a UTF-8 string and returns the rest
100* PCRE Pattern to locate bad bytes in a UTF-8 string
101* Comes from W3 FAQ: Multilingual Forms
102* Note: modified to include full ASCII range including control chars
103* @see http://www.w3.org/International/questions/qa-forms-utf-8
104* @param string
105* @return string
106* @package utf8
107* @subpackage bad
108*/
109function utf8_bad_strip($str) {
110    $UTF8_BAD =
111    '([\x00-\x7F]'.                          # ASCII (including control chars)
112    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
113    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
114    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
115    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
116    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
117    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
118    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
119    '|(.{1}))';                              # invalid byte
120    ob_start();
121    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
122        if ( !isset($matches[2])) {
123            echo $matches[0];
124        }
125        $str = substr($str,strlen($matches[0]));
126    }
127    $result = ob_get_contents();
128    ob_end_clean();
129    return $result;
130}
131
132//--------------------------------------------------------------------
/**
* Replaces bad bytes with an alternative character - an ASCII character
* is recommended as the replacement char.
* The PCRE pattern matches exactly one well-formed UTF-8 sequence, or,
* failing that, one single byte which is then captured in group 2.
* The pattern comes from the W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string to search
* @param string to replace bad bytes with (defaults to '?') - use ASCII
* @return string the input with every bad byte replaced
* @package utf8
* @subpackage bad
*/
function utf8_bad_replace($str, $replace = '?') {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           // excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
    '|(.{1}))';                              // invalid byte

    $len = strlen($str);
    $pos = 0;
    // The result is accumulated in a plain string rather than through the
    // output buffer, so the function has no effect on global output state.
    $result = '';

    // \G pins each match to $pos, so the subject is walked one sequence at
    // a time without copying the remainder of the string on every step.
    while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
        // Group 2 only participates when the "invalid byte" branch matched
        if ( !isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $pos += strlen($matches[0]);
    }
    return $result;
}
170
171//--------------------------------------------------------------------
/**
* Return code from utf8_bad_identify() when the first octet of a five
* octet sequence is detected.
* Note: utf8_bad_identify() treats such a sequence as illegal because the
* encoded codepoint would either not be in shortest form or lie outside
* the Unicode range of 0-0x10FFFF, so it cannot represent a useful character
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_5OCTET',1);

/**
* Return code from utf8_bad_identify() when the first octet of a six
* octet sequence is detected.
* Note: treated as illegal for the same reason as a five octet sequence
* (non-shortest form or codepoint outside the Unicode range)
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_6OCTET',2);

/**
* Return code from utf8_bad_identify().
* Invalid octet for use as start of multi-byte UTF-8 sequence: the octet
* is neither US-ASCII nor a recognised lead octet
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQID',3);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.1, non-shortest form is illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_NONSHORT',4);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.2, surrogate characters are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SURROGATE',5);

/**
* Return code from utf8_bad_identify().
* Codepoints outside the Unicode range (above 0x10FFFF) are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_UNIOUTRANGE',6);

/**
* Return code from utf8_bad_identify().
* Incomplete multi-octet sequence: a continuation octet was expected but
* another octet, or the end of the string, was found instead
* Note: this is kind of a "catch-all"
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQINCOMPLETE',7);
237
238//--------------------------------------------------------------------
/**
* Reports on the type of bad byte found in a UTF-8 string. Returns a
* status code on the first bad byte found and stores the byte index at
* which the problem was detected in $i.
* @author <hsivonen@iki.fi>
* @param string UTF-8 encoded string
* @param int (by reference, optional) receives the byte index at which the
*        problem was detected; for UTF8_BAD_SEQINCOMPLETE this is the octet
*        just before the point where the sequence broke off. Set to NULL
*        when the string is valid UTF-8
* @return mixed integer constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
*/
function utf8_bad_identify($str, &$i = NULL) {

    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // Square brackets: the curly-brace offset syntax is a parse error
        // from PHP 8 onwards
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {

                /* First octet of 5 octet sequence.
                *
                * This is illegal because the encoded codepoint must be either
                * (a) not the shortest form or
                * (b) outside the Unicode range of 0-0x10FFFF.
                */

                return UTF8_BAD_5OCTET;

            } else if (0xFC == (0xFE & ($in))) {

                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;

            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its six payload bits into the
                // codepoint at the position given by the octets still pending.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                * End of the multi-octet sequence. mUcs4 now contains the final
                * Unicode codepoint to be output
                */
                if (0 == --$mState) {

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                // Step back so $i reports the octet before the unexpected one.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    if ( $mState != 0 ) {
        // Incomplete multi-octet sequence: the string ended mid-sequence.
        // $i equals the string length here, so step back to the last octet.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = NULL;
    return FALSE;
}
373
374//--------------------------------------------------------------------
/**
* Takes a return code from utf8_bad_identify() and returns a message
* (in English) explaining what the problem is. An unrecognised code
* raises an E_USER_WARNING.
* @param int return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
function utf8_bad_explain($code) {

    // Stays FALSE unless one of the known codes below matches
    $explanation = FALSE;

    switch ($code) {

        case UTF8_BAD_5OCTET:
            $explanation = 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
            break;

        case UTF8_BAD_6OCTET:
            $explanation = 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
            break;

        case UTF8_BAD_SEQID:
            $explanation = 'Invalid octet for use as start of multi-byte UTF-8 sequence';
            break;

        case UTF8_BAD_NONSHORT:
            $explanation = 'From Unicode 3.1, non-shortest form is illegal';
            break;

        case UTF8_BAD_SURROGATE:
            $explanation = 'From Unicode 3.2, surrogate characters are illegal';
            break;

        case UTF8_BAD_UNIOUTRANGE:
            $explanation = 'Codepoints outside the Unicode range are illegal';
            break;

        case UTF8_BAD_SEQINCOMPLETE:
            $explanation = 'Incomplete multi-octet sequence';
            break;

    }

    if ( $explanation !== FALSE ) {
        return $explanation;
    }

    // No case matched: warn the caller and report failure
    trigger_error('Unknown error code: '.$code,E_USER_WARNING);
    return FALSE;

}