/src/wrappers/glib/partially-implemented/utf8_string.e
Specman e | 2179 lines | 1329 code | 237 blank | 613 comment | 77 complexity | c247625d7eace6f35615ec29190bda03 MD5 | raw file
Large files are truncated, but you can click here to view the full file
indexing
   description: "Unicode UTF-8 string."
   copyright: "[
               Copyright (C) 2006 Paolo Redaelli, Glib team

               This library is free software; you can redistribute it and/or
               modify it under the terms of the GNU Lesser General Public License
               as published by the Free Software Foundation; either version 2.1 of
               the License, or (at your option) any later version.

               This library is distributed in the hope that it will be useful, but
               WITHOUT ANY WARRANTY; without even the implied warranty of
               MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
               Lesser General Public License for more details.

               You should have received a copy of the GNU Lesser General Public
               License along with this library; if not, write to the Free Software
               Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
               02110-1301 USA
               ]"

class UTF8_STRING
   -- A string of Unicode characters encoded into UTF-8.

   -- This particular encoding trades access efficiency for space efficiency:
   -- it uses a variable amount of memory to store each character, from 8 to
   -- 32 bits, thus making traversal and random-access costly; AFAIK random
   -- access is O(n) and traversal while being still an O(1) operation is
   -- much more complex and costly than traversing a normal STRING.

   -- UNICODE_MANIPULATION

   -- A number of functions for dealing with Unicode characters and
   -- strings. There are analogues of the traditional C functions
   -- found in ctype.h character classification and case conversion
   -- functions, UTF-8 analogues of some string utility functions,
   -- functions to perform normalization, case conversion and
   -- collation on UTF-8 strings and finally functions to convert
   -- between the UTF-8, UTF-16 and UCS-4 encodings of Unicode.

   -- The implementations of the Unicode functions in GLib are based
   -- on the Unicode Character Data tables, which are available from
   -- www.unicode.org.
-- GLib 2.8 supports Unicode 4.0, GLib 2.10
   -- supports Unicode 4.1, GLib 2.12 supports Unicode 5.0.

inherit
   HASHABLE
      redefine copy
      end
   COMPARABLE
      redefine is_equal, copy, compare, three_way_comparison
      end
   TRAVERSABLE[UNICODE_CHARACTER]
      redefine is_equal, copy
      end
   RECYCLABLE
      redefine is_equal, copy
      end
   WRAPPER

insert
   GUNICODE_EXTERNALS
   GUNICODE_MACROS
   GMEM_EXTERNALS

creation {ANY}
   make, copy, make_empty, make_filled, from_utf8

feature {ANY}
   capacity: NATURAL_32
      -- String capacity in bytes

   count: NATURAL_32
      -- String length in characters

   lower: NATURAL_32 is do Result:=0.to_natural_32 end
      -- Minimum index; currently it is 0 to mimic C strings; note that
      -- correct coding style shall not rely on the actual value of this
      -- feature.

   upper: NATURAL_32 is
      -- Maximum index
      -- NOTE(review): `count' is unsigned, so on an empty string the
      -- subtraction below presumably wraps around instead of yielding
      -- -1 -- confirm how callers treat `upper' when `is_empty'.
      do
         Result:=count-1.to_natural_32
      end

feature {} --
   bytes_count: NATURAL_32
      -- size in bytes. When you use non-ASCII characters it will be different than count

feature {ANY} -- Creation / Modification:
   make (needed_capacity: NATURAL_32) is
      -- Initialize the string to have at least `needed_capacity' bytes
      -- of storage.
      --
      -- A new buffer is requested with `g_try_malloc' only when the
      -- current `capacity' is too small; `No_more_memory' is raised when
      -- the allocation fails. The buffer is stored in `handle', the
      -- pointer every other feature of this class reads (the previous
      -- code assigned it to an undeclared `wrapper').
      local
         new_buffer: POINTER
      do
         if needed_capacity > 0.to_natural_32 then
            if capacity < needed_capacity then
               -- Allocate into a local first so that `handle' and
               -- `capacity' stay consistent if the allocation fails.
               -- NOTE(review): a buffer previously held in `handle' is
               -- not released here -- confirm who owns it.
               new_buffer := g_try_malloc(needed_capacity)
               if new_buffer.is_null then raise(No_more_memory)
               else
                  handle := new_buffer
                  capacity := needed_capacity
               end
            end
         end
         count := 0
      ensure
         needed_capacity <= capacity
         empty_string: count = 0
      end

   make_empty is
      -- Create an empty string.
      do
         make(0)
      end

   make_filled (a_character: UNICODE_CHARACTER; n: NATURAL_32) is
      -- Initialize string with `n' copies of `a_character'.
require
         a_character.is_valid
      do
         make(n)
         count := n
         fill_with(a_character)
      ensure
         count_set: count = n
         filled: occurrences(a_character) = count
      end

   from_string (a_string: STRING) is
      -- Create an UTF8 string from `a_string'.
      --
      -- The bytes of `a_string' are validated with `g_utf8_validate'
      -- and then duplicated; `Non_valid_utf8_string' is raised when
      -- they are not valid UTF-8.
      -- NOTE(review): exactly `a_string.count' bytes are duplicated, so
      -- the copy carries no terminating zero byte, while features such
      -- as `compare' hand `handle' to C functions that presumably
      -- expect one -- confirm.
      require a_string/=Void
      local validated: BOOLEAN; an_end: POINTER
      do
         validated := g_utf8_validate (a_string.to_external, a_string.count, $an_end).to_boolean
         if validated then
            handle := g_memdup(a_string.to_external, a_string.count)
            bytes_count := a_string.count
            capacity := bytes_count
            count := g_utf8_strlen(handle, bytes_count)
         else
            raise(Non_valid_utf8_string)
         end
      end

   Non_valid_utf8_string: STRING is "Given string is not UTF8 valid."

feature {WRAPPER_HANDLER}
   from_pointer (a_pointer: POINTER) is
      -- Create an UTF8 using the content found into `a_pointer' which is a zero-terminated string and it is not copied
      local validated: BOOLEAN; an_end: POINTER
      do
         validated := g_utf8_validate (a_pointer, -1, $an_end).to_boolean
         if validated then
            handle := a_pointer
            -- `an_end' is filled in by `g_utf8_validate'; its distance
            -- from the start is taken as the size in bytes.
            bytes_count := an_end - a_pointer
            capacity := bytes_count
            count := g_utf8_strlen(handle, bytes_count)
         else
            raise(Non_valid_utf8_string)
         end
      end

feature {ANY} -- Testing:
   is_empty: BOOLEAN is
      -- Has string length 0?
      --
      -- See also `count'.
      do
         Result := count = 0
      end

   item (i: like lower): UNICODE_CHARACTER is
      -- Get unicode at position `i'.
      --
      -- See also `lower', `upper', `valid_index', `put'.
      require valid_index(i)
      local location: POINTER
      do
         -- `g_utf8_offset_to_pointer' already returns the address of the
         -- `i'-th character, so it must not be added to `handle' again.
         location := g_utf8_offset_to_pointer(handle,i)
         Result.set(g_utf8_get_char(location))
      end

   infix "@" (i: like lower): like item is
      -- The infix notation which is actually just a synonym for `item'.
      --
      -- See also `item', `put'.
require valid_index(i)
      do
         Result := item(i)
      ensure definition: Result = item(i)
      end

   hash_code: INTEGER is
      -- Hash value folded from the codes of all characters, in
      -- iteration order; never negative.
      local i: ITERATOR_ON_UTF8_STRING
      do
         from i:=get_new_iterator; i.start
         until i.is_off
         loop
            Result := {INTEGER 5} #* Result #+ i.item.code.hash_code
            i.next
         end
         if Result < 0 then
            -- Bitwise complement maps a negative value to a non-negative one.
            Result := ~Result
         end
      end

   infix "<" (other: like Current): BOOLEAN is
      -- Is `Current' less than `other'?
      --
      -- Characters are compared pairwise until one string ends or a
      -- differing pair is found; when one string ends first, the one
      -- with fewer characters is the smaller.
      --
      -- See also `>', `<=', `>=', `min', `max'.
      local
         i,j: ITERATOR_ON_UTF8_STRING
      do
         from
            i:=get_new_iterator; i.start
            j:=other.get_new_iterator; j.start
         until (i.is_off or j.is_off) or else i.item/=j.item
         loop i.next; j.next
         end
         if i.is_off or j.is_off then
            Result := count < other.count
         else Result := i.item < j.item
         end
      end

   compare, three_way_comparison (other: like Current): INTEGER is
      -- Compares Current with `other' using the linguistically correct rules
      -- for the current locale. Result is < 0 if Current compares before
      -- `other', 0 if they compare equal, > 0 if Current compares after
      -- `other'. When sorting a large number of strings, it will be
      -- significantly faster to obtain collation keys with `collate_key'
      -- and compare the keys when sorting instead of sorting the original
      -- strings. Note: in C language comparison of collated key is done with
      -- function strcmp; AFAIK it should be the same comparing Eiffel
      -- STRINGs Paolo 2009-06-22
      do
         Result :=g_utf8_collate(handle,other.handle)

      end

   is_equal (other: like Current): BOOLEAN is
      -- Do both strings have the same character sequence?
      --
      -- See also `same_as'.
      local i,j: ITERATOR_ON_UTF8_STRING
      do
         if Current = other then Result := True
         else
            if count = other.count then
               -- TODO: Could be improved.
-- If stored in canonical form
               -- direct memory comparison could be made. Paolo 2009-06-20
               from
                  Result := True
                  i := Current.get_new_iterator; i.start
                  j := other.get_new_iterator; j.start
               until i.is_off or else Result=False
               loop
                  -- Both strings have the same `count', so the two
                  -- iterators are expected to run off together.
                  check not i.is_off implies not j.is_off end
                  Result := i.item = j.item
                  i.next; j.next
               end
            else Result:=False
            end
         end
      end

   same_as (other: UNICODE_STRING): BOOLEAN is
      -- Case insensitive `is_equal'.
      require
         other /= Void
      local i,j: ITERATOR_ON_UTF8_STRING
      do
         if count=other.count then
            from
               i := Current.get_new_iterator; i.start
               j := other.get_new_iterator; j.start
               Result := True
            until i.is_off or else Result = False
            loop
               Result := i.item.to_lower = j.item.to_lower
               i.next; j.next
            end
         else Result:=False
         end
      end

   index_of (a_character: like item; a_start_index: like lower): REFERENCE[like lower] is
      -- Index of first occurrence of `a_character' at or after `a_start_index'
      -- Void if not found.
      --
      -- Currently calls `not_yet_implemented'; the code that follows
      -- searches from the beginning of the string and ignores
      -- `a_start_index'.
      --
      -- See also `reverse_index_of', `first_index_of', `last_index_of', `has'.
      require
         -- NOTE(review): these bounds look 1-based while `lower' is 0 --
         -- confirm the intended range.
         valid_start_index: a_start_index >= 1 and a_start_index <= count + 1
         a_character.is_valid
      local location: POINTER; integer_result: INTEGER_32
      do
         not_yet_implemented
         -- Not correct
         location := g_utf8_strchr(handle, -1, a_character.code)
         if location.is_not_null then
            integer_result := g_utf8_pointer_to_offset(handle,location)
            -- Offset 0 is a legitimate answer: the first character.
            check integer_result >= 0 end
            create Result.set_item(integer_result.to_natural_32)
         end
      ensure
         Result /= Void implies item(Result.item) = a_character
      end

   reverse_index_of (a_character: like item; a_start_index: like lower): REFERENCE[like lower] is
      -- Index of first occurrence of `a_character' at or before
      -- `a_start_index'; Void if none.
--
      -- The search is done in reverse direction, which means from the `a_start_index' down
      -- to the first character.
      --
      -- Currently calls `not_yet_implemented'; the code that follows
      -- searches the whole string and ignores `a_start_index'.
      --
      -- See also `index_of', `last_index_of', `first_index_of'.
      require
         valid_start_index: a_start_index >= 0 and a_start_index <= count
         a_character.is_valid
      local location: POINTER; integer_result: INTEGER_32
      do
         not_yet_implemented
         -- Not correct
         location := g_utf8_strrchr(handle, -1, a_character.code)
         if location.is_not_null then
            integer_result := g_utf8_pointer_to_offset(handle,location)
            -- Offset 0 is a legitimate answer: the first character.
            check integer_result >= 0 end
            create Result.set_item(integer_result.to_natural_32)
         end
      ensure
         Result /= Void implies item(Result.item) = a_character
      end

   first_index_of (a_character: like item): REFERENCE[like lower] is
      -- Index of first occurrence of `a_character'; Void when the
      -- character does not occur.
      --
      -- See also `last_index_of', `index_of', `reverse_index_of'.
      require
         a_character.is_valid
      local location: POINTER; integer_result: INTEGER_32
      do
         location := g_utf8_strchr(handle, -1, a_character.code)
         if location.is_not_null then
            integer_result := g_utf8_pointer_to_offset(handle,location)
            -- A match on the very first character yields offset 0, so
            -- the sanity check must accept zero.
            check integer_result >= 0 end
            create Result.set_item(integer_result.to_natural_32)
         end
      ensure
         definition: Result /= Void implies Result.is_equal(index_of(a_character,lower))
      end

   last_index_of (unicode: like item): REFERENCE[like lower] is
      -- Index of last occurrence of `unicode', Void if none.
      --
      -- Currently calls `not_yet_implemented'.
      --
      -- See also `first_index_of', `reverse_index_of', `index_of'.
      do
         not_yet_implemented
      ensure
         definition: Result = reverse_index_of(unicode, upper)
      end

   has (a_character: like first): BOOLEAN is
      -- True if `a_character' is in the STRING.
      --
      -- See also `index_of', `occurrences', `has_substring'.
require
         valid_unicode_value: a_character.is_valid
      do
         -- The argument is named `a_character' (there is no `unicode'
         -- here) and the index queries return a reference that is Void
         -- when nothing is found, so test against Void rather than 0.
         -- `first_index_of' is used because `index_of' still calls
         -- `not_yet_implemented'.
         Result := first_index_of(a_character) /= Void
      end

   has_substring (other: UNICODE_STRING): BOOLEAN is
      -- True if `Current' contains `other'.
      --
      -- See also `substring_index', `has'.
      require
         other_not_void: other /= Void
      do
         Result := substring_index(other, 1) /= 0
      end

   occurrences (unicode: INTEGER): INTEGER is
      -- Number of times character `unicode' appears in the string.
      --
      -- NOTE(review): this feature still treats the result of
      -- `index_of' as an integer where 0 means "not found", while
      -- `index_of' in this class is declared to return a reference --
      -- to be reconciled when `index_of' is completed.
      --
      -- See also `remove_all_occurrences', `has'.
      require
         valid_unicode_value: valid_unicode(unicode)
      local
         i: INTEGER
      do
         from
            i := index_of(unicode, 1)
         until
            i = 0
         loop
            Result := Result + 1
            -- Resume just after the match: searching again from `i'
            -- would find the same occurrence forever.
            i := index_of(unicode, i + 1)
         end
      ensure
         Result >= 0
      end

   has_suffix (s: UNICODE_STRING): BOOLEAN is
      -- True if suffix of `Current' is `s'.
      --
      -- See also `remove_suffix', `has_prefix', `has_substring'.
      require
         s /= Void
      local
         i, offset: INTEGER
      do
         offset := count - s.count
         from
            -- A negative offset means `s' is longer than `Current'.
            Result := offset >= 0
            i := lower
         until
            not Result or else i > s.upper
         loop
            Result := item(i + offset) = s.item(i)
            i := i + 1
         end
      end

   has_prefix (p: UNICODE_STRING): BOOLEAN is
      -- True if prefix of `Current' is `p'.
      require
         p /= Void
      local
         i: INTEGER
      do
         from
            Result := count >= p.count
            i := lower
         until
            not Result or else i > p.upper
         loop
            Result := item(i) = p.item(i)
            i := i + 1
         end
      end

feature {ANY} -- Testing and Conversion:
   is_ascii: BOOLEAN is
      -- True if all unicode value is in range 0..127
      local
         i: INTEGER
      do
         from
            i := count - 1
         until
            -- Stop at the first unit with any bit above the low seven set.
            i < 0 or else storage.item(i) & 0xFF80 /= 0
         loop
            i := i - 1
         end
         Result := i < 0
      end

   to_utf8: STRING is
      -- New string is created, current unicode string is encoded
      -- with UTF-8 format.
--
      -- See also: `utf8_encode_in' and `as_utf8' to save memory.
      do
         -- Encode into the shared `tmp_buffer', then hand back a copy.
         tmp_buffer.clear_count
         utf8_encode_in(tmp_buffer)
         Result := tmp_buffer.twin
      end

   to_string: STRING is
      obsolete "Now use `to_utf8' instead (May 2008)."
      do
         Result := to_utf8
      end

   as_utf8: STRING is
      -- Encode the string in UTF-8. Always returns the same once object.
      --
      -- See also: `to_utf8', `utf8_encode_in'.
      do
         Result := once ""
         Result.clear_count
         utf8_encode_in(Result)
      end

   as_string: STRING is
      obsolete "Now use `as_utf8' instead (May 2008)."
      do
         Result := as_utf8
      end

   utf8_encode_in (s: STRING) is
      -- Append the string in UTF-8 to `s'.
      --
      -- NOTE(review): the loop below runs over positions 1 .. `count',
      -- while `lower' in this class is 0 -- confirm the intended
      -- index base.
      --
      -- See also: `to_utf8', `as_utf8'.
      require
         s /= Void
      local
         i: INTEGER; v: INTEGER
      do
         from
            i := 1
         until
            i > count
         loop
            v := item(i)
            if v < 128 then
               -- One byte: the value itself.
               s.extend(v.to_character)
            elseif v < 2048 then
               -- Two bytes: lead byte 192 + high bits, then one
               -- continuation byte 128 + low six bits.
               s.extend((v #// 64 + 192).to_character)
               s.extend((v #\\ 64 + 128).to_character)
            elseif v < 65536 then
               -- Three bytes: lead byte 224 + high bits, then two
               -- continuation bytes.
               s.extend((v #// 4096 + 224).to_character)
               v := v #\\ 4096
               s.extend((v #// 64 + 128).to_character)
               s.extend((v #\\ 64 + 128).to_character)
            else
               check
                  v < 0x00110000
               end
               -- Four bytes: lead byte 240 + high bits, then three
               -- continuation bytes.
               s.extend((v #// 0x00040000 + 240).to_character)
               v := v #\\ 0x00040000
               s.extend((v #// 0x00001000 + 128).to_character)
               v := v #\\ 0x00001000
               s.extend((v #// 64 + 128).to_character)
               s.extend((v #\\ 64 + 128).to_character)
            end
            i := i + 1
         end
      end

   utf16be_encode_in (s: STRING) is
      -- Append the string in UTF-16BE to `s'
      require
         s /= Void
      local
         i, k: INTEGER; v: INTEGER_16
      do
         from
         until
            i >= count
         loop
            v := storage.item(i)
            -- High byte first, then low byte.
            s.extend((v |>>> 8).to_character)
            s.extend((v & 0x00FF).to_character)
            if v & 0xF800 = 0xD800 then
               -- `v' lies in the surrogate range; its low half is kept
               -- in the side arrays, entry `k'.
               check
                  low_surrogate_indexes.item(k) = i + 1
               end
s.extend((low_surrogate_values.item(k) #// 256 + 220).to_character)
               s.extend((low_surrogate_values.item(k) & 0x00FF).to_character)
               k := k + 1
            end
            i := i + 1
         end
      end

   utf8_decode_from (s: STRING): BOOLEAN is
      -- Use `s' as UTF-8 format encoded unicode string
      -- Return `False' if decoding process failed
      --
      -- Decoded characters are appended with `extend'. Every malformed,
      -- overlong or invalid sequence appends 65533 instead and makes
      -- the result `False'; decoding then continues with the next byte.
      require
         s /= Void
      local
         i, k, seq_length: INTEGER; v: INTEGER
      do
         from
            Result := True
            i := 1
         until
            i > s.count
         loop
            v := s.item(i).code
            i := i + 1
            -- Classify the lead byte; `k' becomes the total length of
            -- the sequence (0 when nothing more is expected).
            inspect
               v
            when 0 .. 127 then
               extend(v)
               k := 0
            when 192 .. 223 then
               v := v - 192
               k := 2
            when 224 .. 239 then
               v := v - 224
               k := 3
            when 240 .. 247 then
               v := v - 240
               k := 4
            else
               -- Neither a single-byte value nor a recognised lead byte.
               extend(65533)
               Result := False
               k := 0
            end
            from
               seq_length := k
            until
               k <= 1
            loop
               -- Each continuation byte must be in 128 .. 191 and
               -- contributes six more bits.
               if i <= s.count and then s.item(i).code.in_range(128, 191) then
                  v := v * 64 + s.item(i).code - 128
                  i := i + 1
                  k := k - 1
               else
                  extend(65533)
                  Result := False
                  k := 0
               end
            end
            -- `k' = 1 only when a multi-byte sequence was read in full.
            if k = 1 then
               if v < 128 or else v < 2048 and then seq_length > 2 or else v < 65536 and then seq_length > 3 then
                  -- overlong sequence, must be refused by any UTF-8
                  -- compliant decoder for security reasons.
                  extend(65533)
                  Result := False
               elseif not valid_unicode(v) then
                  extend(65533)
                  Result := False
               else
                  extend(v)
               end
            end
         end
      end

feature {}
   from_utf8 (s: STRING) is
      -- Use `s' as UTF-8 format encoded unicode string
      -- This function may be used for manifest strings
      -- See `utf8_decode_from' for error detection
      require
         s /= Void
      local
         error: BOOLEAN
      do
         make(s.count)
         -- The outcome of the decoding is stored but not acted upon.
         error := utf8_decode_from(s)
      end

feature {ANY} -- Modification:
   resize (new_count: INTEGER) is
      -- Resize Current.
-- When `new_count' is greater than
      -- `count', new positions are initialized with unicode 0.
      require
         new_count >= 0
      local
         i: INTEGER
      do
         if new_count <= count then
            -- Shrinking or same size: the storage is left untouched.
         elseif capacity < new_count then
            if capacity = 0 then
               storage := storage.calloc(new_count)
            else
               storage := storage.realloc(capacity, new_count)
            end
            capacity := new_count
         else
            -- Enough room already: clear the newly exposed positions.
            storage.clear(count, new_count - 1)
         end
         count := new_count
         -- Drop trailing surrogate entries whose position is now
         -- beyond `new_count'.
         from
            i := low_surrogate_indexes.upper
         until
            i < 0 or else low_surrogate_indexes.item(i) <= new_count
         loop
            --TODO: only one remove out of the loop
            low_surrogate_indexes.remove_last
            low_surrogate_values.remove_last
            i := i - 1
         end
      ensure
         count = new_count
         capacity >= old capacity
      end

   clear_count, wipe_out is
      -- Discard all characters so that `is_empty' is True after that call.
      -- The internal `capacity' is not changed by this call (i.e. the internal `storage' memory is
      -- neither released nor shrunk).
      --
      -- See also `clear_count_and_capacity'.
      do
         count := 0
         low_surrogate_indexes.make(0)
         low_surrogate_values.make(0)
      ensure
         is_empty: count = 0
         capacity = old capacity
      end

   clear_count_and_capacity is
      -- Discard all characters (`is_empty' is True after that call). The internal `capacity' may also be
      -- reduced after this call.
      --
      -- NOTE(review): the postcondition requires `capacity = 0', but
      -- the assignment that would reset `capacity' is commented out
      -- below -- confirm which of the two is intended.
      --
      -- See also `clear_count'.
      do
         low_surrogate_indexes.clear_count_and_capacity
         low_surrogate_values.clear_count_and_capacity
         clear_count
         --*** capacity := 0
         --*** storage := null_storage
      ensure
         is_empty: count = 0
         capacity = 0
      end

   copy (other: like Current) is
      -- Copy `other' onto Current.
      --
      -- See also `copy_substring'.
do
         count := other.count
         if count > 0 then
            -- Allocate a fresh storage only when the current one is
            -- too small.
            if capacity < count then
               storage := storage.calloc(count)
               capacity := count
            end
            storage.copy_from(other.storage, count - 1)
         end
         -- `copy' is also listed as a creation procedure, in which case
         -- the side arrays do not exist yet.
         if low_surrogate_indexes = Void then
            create low_surrogate_indexes.make(0)
            create low_surrogate_values.make(0)
         end
         low_surrogate_indexes.copy(other.low_surrogate_indexes)
         low_surrogate_values.copy(other.low_surrogate_values)
      ensure then
         count = other.count
      end

   copy_substring (s: like Current; start_index, end_index: INTEGER) is
      -- Copy the substring from `s' from `start_index' to `end_index'
      -- to Current.
      --
      -- Implemented as `clear_count' followed by `append_substring'.
      --
      -- See also `copy'.
      --|*** DUMB IMPLEMENTATION
      require
         string_not_void: s /= Void
         valid_start_index: 1 <= start_index
         valid_end_index: end_index <= s.count
         meaningful_interval: start_index <= end_index + 1
      do
         clear_count
         append_substring(s, start_index, end_index)
      end

   fill_with (unicode: INTEGER) is
      -- Replace every unicode with the new value.
require
         valid_unicode_value: valid_unicode(unicode)
      local
         i: INTEGER; code: INTEGER_16; remainder: INTEGER_16
      do
         if unicode >= 65536 then
            -- stored as high and low surrogate
            code := (unicode #// 1024 - 64).low_16
            remainder := (unicode & 0x000003FF).to_integer_16 --unicode #\\ 1024
            storage.set_all_with(code, count - 1)
            low_surrogate_values.resize(count)
            low_surrogate_values.set_all_with(remainder)
            -- Every position now holds a surrogate: entry `i' records
            -- position `i' + 1.
            from
               i := count - 1
               low_surrogate_indexes.resize(count)
            until
               i < 0
            loop
               low_surrogate_indexes.put(i + 1, i)
               i := i - 1
            end
         else
            -- Value fits in one 16 bit unit: no surrogate entries remain.
            code := unicode.low_16
            storage.set_all_with(code, count - 1)
            low_surrogate_values.resize(0)
            low_surrogate_indexes.resize(0)
         end
      ensure
         occurrences(unicode) = count
      end

   replace_all (old_code, new_code: like item) is
      -- Replace all occurrences of the element `old_code' by `new_code'.
      require
         valid_unicode_value: valid_unicode(old_code)
         valid_unicode_value: valid_unicode(new_code)
      local
         i: INTEGER
      do
         --*** May be implemented in a more efficient way...
         if old_code /= new_code then
            from
               i := index_of(old_code, 1)
            until
               i = 0
            loop
               put(new_code, i)
               -- Continue the search just after the replaced position.
               i := index_of(old_code, i + 1)
            end
         end
      ensure
         count = old count
         old_code /= new_code implies occurrences(old_code) = 0
      end

   append, append_string (s: UNICODE_STRING) is
      -- Append a copy of 's' to `Current'.
      --
      -- See also `add_last', `add_first', `prepend', '+'.
require
         s_not_void: s /= Void
      local
         s_count, needed_capacity, new_capacity, i: INTEGER; indexes: FAST_ARRAY[INTEGER]
      do
         s_count := s.count
         needed_capacity := count + s_count
         if needed_capacity > capacity then
            if capacity = 0 then
               storage := storage.calloc(needed_capacity)
               capacity := needed_capacity
            else
               -- Grow to at least twice the current capacity.
               new_capacity := (2 * capacity).max(needed_capacity)
               storage := storage.realloc(capacity, new_capacity)
               capacity := new_capacity
            end
         end
         storage.copy_at(count, s.storage, s_count)
         -- Carry over the surrogate entries of `s', shifting their
         -- positions by the current `count'.
         from
            indexes := s.low_surrogate_indexes
         until
            i > indexes.upper
         loop
            low_surrogate_indexes.add_last(indexes.item(i) + count)
            low_surrogate_values.add_last(s.low_surrogate_values.item(i))
            i := i + 1
         end
         count := needed_capacity
      end

   append_substring (s: like Current; start_index, end_index: INTEGER) is
      -- Append the substring from `s' from `start_index' to `end_index'
      -- to Current.
      --
      -- Characters are appended one at a time with `extend'.
      --|*** DUMB IMPLEMENTATION
      require
         string_not_void: s /= Void
         valid_start_index: 1 <= start_index
         valid_end_index: end_index <= s.count
         meaningful_interval: start_index <= end_index + 1
      local
         i: INTEGER
      do
         from
            i := start_index
         until
            i > end_index
         loop
            extend(s.item(i))
            i := i + 1
         end
      end

   prepend (other: UNICODE_STRING) is
      -- Prepend `other' to `Current'.
      --
      -- See also `append'.
      require
         other /= Void
      local
         i, j, k: INTEGER
      do
         i := count
         j := other.count
         resize(i + j)
         if i > 0 and then j > 0 then
            -- Shift the existing content right by `j' and move the
            -- recorded surrogate positions along with it.
            storage.move(0, i - 1, j)
            from
               k := low_surrogate_indexes.upper
            until
               k < 0
            loop
               low_surrogate_indexes.put(low_surrogate_indexes.item(k) + j, k)
               k := k - 1
            end
         end
         -- May be implemented in a more efficient way...
from
            k := other.low_surrogate_indexes.upper
         until
            k < 0
         loop
            -- Walking backwards while adding in front keeps the entries
            -- of `other' in their original order.
            low_surrogate_indexes.add_first(other.low_surrogate_indexes.item(k))
            low_surrogate_values.add_first(other.low_surrogate_values.item(k))
            k := k - 1
         end
         storage.copy_from(other.storage, j - 1)
      ensure
         (old other.twin + old Current.twin).is_equal(Current)
      end

   insert_string (s: UNICODE_STRING; i: INTEGER) is
      -- Insert `s' at index `i', shifting characters from index `i'
      -- to `count' rightwards.
      --
      -- NOTE(review): the two loops that merge the surrogate side
      -- arrays depend on exact statement order and on the meaning of
      -- `low_surrogate_position', which is defined outside this view;
      -- they are left untouched -- verify before changing.
      require
         string_not_void: s /= Void
         valid_insertion_index: 1 <= i and i <= count + 1
      local
         j, k: INTEGER; pos, n: INTEGER
      do
         j := count
         k := s.count
         resize(j + k)
         if i <= j then
            -- Open a gap of `k' units at the insertion point.
            storage.move(i - 1, j - 1, k)
         end
         storage.copy_at(i - 1, s.storage, k)
         pos := low_surrogate_position(i)
         j := low_surrogate_indexes.count + s.low_surrogate_indexes.count
         low_surrogate_indexes.resize(j)
         low_surrogate_values.resize(j)
         from
            -- move existing surrogates and adjust indexes
            n := s.low_surrogate_indexes.upper
         until
            n < 0
         loop
            j := j - 1
            low_surrogate_indexes.put(low_surrogate_indexes.item(pos + n) + k, j)
            low_surrogate_values.put(low_surrogate_values.item(pos + n), j)
            n := n - 1
         end
         from
            -- copy surrogates from s and adjust indexes
            n := s.low_surrogate_indexes.upper
            j := pos + n
         until
            n < 0
         loop
            low_surrogate_indexes.put(s.low_surrogate_indexes.item(n) + i, j)
            low_surrogate_values.put(s.low_surrogate_values.item(n), j)
            j := j - 1
            n := n - 1
         end
      end

   replace_substring (s: UNICODE_STRING; start_index, end_index: INTEGER) is
      -- Replace the substring from `start_index' to `end_index',
      -- inclusive, with `s'.
require
         string_not_void: s /= Void
         valid_start_index: 1 <= start_index
         valid_end_index: end_index <= count
         meaningful_interval: start_index <= end_index + 1
      do
         -- May be implemented in a more efficient way...
         remove_between(start_index, end_index)
         insert_string(s, start_index)
      end

   infix "+" (other: UNICODE_STRING): like Current is
      -- Create a new UNICODE_STRING which is the concatenation of
      -- `Current' and `other'.
      --
      -- See also `append'.
      require
         other_exists: other /= Void
      do
         -- Reserve room for both operands up front.
         create Result.make(count + other.count)
         Result.append(Current)
         Result.append(other)
      ensure
         result_count: Result.count = count + other.count
      end

   put (unicode: INTEGER; i: INTEGER) is
      -- Put `unicode' at position `i'.
      --
      -- See also `item', `lower', `upper', `swap'.
      require
         valid_index: valid_index(i)
         valid_unicode_value: valid_unicode(unicode)
      local
         v, n: INTEGER
      do
         if unicode >= 65536 then
            -- stored as high and low surrogate
            v := unicode #// 1024 - 64
            if storage.item(i - 1) & 0xF800 = 0xD800 then
               -- The position already holds a surrogate: overwrite its
               -- low half in place.
               low_surrogate_values.put((unicode & 0x000003FF).to_integer_16, low_surrogate_index(i))
            else
               -- Make room for a new side entry at slot `n'.
               n := low_surrogate_position(i)
               low_surrogate_indexes.add_last(0)
               low_surrogate_values.add_last(0)
               if n /= low_surrogate_indexes.upper then
                  low_surrogate_indexes.move(n, low_surrogate_indexes.upper - 1, 1)
                  low_surrogate_values.move(n, low_surrogate_values.upper - 1, 1)
               end
               low_surrogate_indexes.put(i, n)
               low_surrogate_values.put((unicode & 0x000003FF).to_integer_16, n)
            end
            storage.put(v.low_16, i - 1)
         else
            if storage.item(i - 1) & 0xF800 = 0xD800 then
               -- A surrogate is being replaced by a single unit: drop
               -- its side entry.
               v := low_surrogate_index(i)
               low_surrogate_indexes.remove(v)
               low_surrogate_values.remove(v)
            end
            storage.put(unicode.low_16, i - 1)
         end
      ensure
         item(i) = unicode
      end

   swap (i1, i2: INTEGER) is
-- Swap two characters.
      --
      -- Besides exchanging the two storage units, the side arrays
      -- describing low surrogates are updated when at least one of the
      -- two positions holds a surrogate.
      --
      -- See also `item', `put'.
      require
         valid_index(i1)
         valid_index(i2)
      local
         tmp: INTEGER_16; j1, j2: INTEGER; low_tmp: INTEGER_16; k1, k2: INTEGER
      do
         j1 := i1 - 1
         j2 := i2 - 1
         tmp := storage.item(j1)
         if tmp & 0xF800 = 0xD800 then
            if storage.item(j2) & 0xF800 = 0xD800 then
               -- Both positions hold surrogates.
               k1 := low_surrogate_index(i1)
               k2 := low_surrogate_index(i2)
               low_tmp := low_surrogate_values.item(k1)
               low_surrogate_values.put(low_surrogate_values.item(k2), k1)
               low_surrogate_values.put(low_tmp, k2)
               low_surrogate_indexes.put(i2, k1)
               low_surrogate_indexes.put(i1, k2)
            else
               -- Only `i1' holds a surrogate. Its slot `k1' must be
               -- known before the low value is read through it, and
               -- the value must be saved before the entries are moved.
               k1 := low_surrogate_index(i1)
               low_tmp := low_surrogate_values.item(k1)
               k2 := low_surrogate_position(i2)
               if k2 > k1 + 1 then
                  low_surrogate_indexes.move(k1 + 1, k2 - 1, -1)
                  low_surrogate_values.move(k1 + 1, k2 - 1, -1)
                  k2 := k2 - 1
               elseif k1 > k2 then
                  low_surrogate_indexes.move(k2, k1 - 1, 1)
                  low_surrogate_values.move(k2, k1 - 1, 1)
                  --else no move
               end
               -- NOTE(review): the surrogate ends up at position `i2'
               -- while `i1' is recorded here -- confirm.
               low_surrogate_indexes.put(i1, k2)
               low_surrogate_values.put(low_tmp, k2)
            end
         else
            if storage.item(j2) & 0xF800 = 0xD800 then
               -- Only `i2' holds a surrogate: same ordering rule, slot
               -- `k2' first, then the value stored there.
               k1 := low_surrogate_position(i1)
               k2 := low_surrogate_index(i2)
               low_tmp := low_surrogate_values.item(k2)
               if k1 > k2 + 1 then
                  low_surrogate_indexes.move(k2 + 1, k1 - 1, -1)
                  low_surrogate_values.move(k2 + 1, k1 - 1, -1)
                  k1 := k1 - 1
               elseif k2 > k1 then
                  low_surrogate_indexes.move(k1, k2 - 1, 1)
                  low_surrogate_values.move(k1, k2 - 1, 1)
                  --else no move
               end
               low_surrogate_indexes.put(i2, k1)
               low_surrogate_values.put(low_tmp, k1)
               -- else i1 and i2 are not surrogate
            end
         end
         storage.put(storage.item(j2), j1)
         storage.put(tmp, j2)
      ensure
         item(i1) = old item(i2)
         item(i2) = old item(i1)
      end

   insert_character (unicode: INTEGER; i: INTEGER) is
-- Inserts `unicode' at index `i', shifting characters from
      -- position 'i' to `count' rightwards.
      require
         valid_insertion_index: 1 <= i and i <= count + 1
         valid_unicode_value: valid_unicode(unicode)
      local
         j, k: INTEGER
      do
         -- Surrogate entries from slot `k' upwards refer to positions
         -- that move one place to the right.
         k := low_surrogate_position(i)
         from
            j := low_surrogate_indexes.upper
         until
            j < k
         loop
            low_surrogate_indexes.put(low_surrogate_indexes.item(j) + 1, j)
            j := j - 1
         end
         resize(count + 1)
         if count > 1 then
            storage.move(i - 1, count - 2, 1)
            -- Clear the freed unit before `put' inspects it.
            storage.put(0, i - 1)
         end
         put(unicode, i)
      ensure
         item(i) = unicode
      end

   shrink (min_index, max_index: INTEGER) is
      -- Keep only the slice [`min_index' .. `max_index'] or nothing
      -- when the slice is empty.
      require
         1 <= min_index
         max_index <= count
         min_index <= max_index + 1
      local
         i, j: INTEGER
      do
         if max_index < min_index then
            -- Empty slice: drop everything.
            count := 0
            low_surrogate_indexes.make(0)
            low_surrogate_values.make(0)
         elseif min_index = 1 then
            -- Slice starts at the beginning: only the tail is cut.
            count := max_index
            i := low_surrogate_position(count)
            if i <= low_surrogate_indexes.upper then
               if low_surrogate_indexes.item(i) = max_index then
                  -- The last kept position is itself a surrogate.
                  i := i + 1
               end
            end
            low_surrogate_indexes.resize(i)
            low_surrogate_values.resize(i)
         else
            -- General case: move the slice to the front and re-base
            -- the surrogate entries that fall inside it.
            storage.slice_copy(0, storage, min_index - 1, max_index - 1)
            from
               i := low_surrogate_position(min_index)
            until
               i > low_surrogate_indexes.upper or else low_surrogate_indexes.item(i) > max_index
            loop
               low_surrogate_indexes.put(low_surrogate_indexes.item(i) - min_index + 1, j)
               low_surrogate_values.put(low_surrogate_values.item(i), j)
               j := j + 1
               i := i + 1
            end
            low_surrogate_indexes.resize(j)
            low_surrogate_values.resize(j)
            count := max_index - min_index + 1
         end
      ensure
         count = max_index - min_index + 1
      end

   remove (i: INTEGER) is
-- Remove character at position `i'.
      --
      -- See also `remove_head', `remove_between', `remove_suffix', `remove_prefix'.
      require
         valid_removal_index: valid_index(i)
      do
         remove_between(i, i)
      ensure
         count = old count - 1
      end

   add_first, precede (unicode: INTEGER) is
      -- Add `unicode' at first position.
      --
      -- See also `add_last'.
      require
         valid_unicode_value: valid_unicode(unicode)
      local
         i: INTEGER
      do
         -- Every recorded surrogate position moves one place to the right.
         from
            i := low_surrogate_indexes.upper
         until
            i < 0
         loop
            low_surrogate_indexes.put(low_surrogate_indexes.item(i) + 1, i)
            i := i - 1
         end
         resize(count + 1)
         if count > 1 then
            storage.move(0, count - 2, 1)
            -- Clear the freed first unit before `put' inspects it.
            storage.put(0, 0)
         end
         put(unicode, 1)
      ensure
         count = 1 + old count
         item(1) = unicode
      end

   add_last, append_character, extend (unicode: INTEGER) is
      -- Append `unicode' to string.
      --
      -- See also `add_first'.
      require
         valid_unicode_value: valid_unicode(unicode)
      local
         new_capacity: INTEGER
      do
         if capacity > count then
            -- Room is available: nothing to allocate.
         elseif capacity = 0 then
            new_capacity := 32
            storage := storage.calloc(new_capacity)
            capacity := new_capacity
         else
            new_capacity := 2 * capacity
            storage := storage.realloc(capacity, new_capacity)
            capacity := new_capacity
         end
         if unicode >= 65536 then
            -- stored as high and low surrogate
            -- NOTE(review): `fill_with' and `put' record position
            -- (storage index + 1) in `low_surrogate_indexes', whereas
            -- the not yet incremented `count' is recorded here --
            -- confirm which base is intended.
            low_surrogate_indexes.add_last(count)
            low_surrogate_values.add_last((unicode & 0x000003FF).to_integer_16)
            storage.put((unicode #// 1024 - 64).low_16, count)
         else
            storage.put(unicode.low_16, count)
         end
         count := count + 1
      ensure
         count = 1 + old count
         item(count) = unicode
      end

   to_lower is
      -- Convert all characters to lower case.
      --
      -- See also `to_upper', `as_lower', `as_upper'.
1220 do 1221 not_yet_implemented 1222 end 1223 1224 to_upper is 1225 -- Convert all characters to upper case. 1226 -- 1227 -- See also `to_lower', `as_upper', `as_lower'. 1228 do 1229 not_yet_implemented 1230 end 1231 1232 as_lower: like Current is 1233 -- New object with all letters in lower case. 1234 -- 1235 -- See also `as_upper', `to_lower', `to_upper'. 1236 do 1237 create Result.copy(Current) 1238 Result.to_lower 1239 end 1240 1241 as_upper: like Current is 1242 -- New object with all letters in upper case. 1243 -- 1244 -- See also `as_lower', `to_upper', `to_lower'. 1245 do 1246 create Result.copy(Current) 1247 Result.to_upper 1248 end 1249 1250 keep_head (n: INTEGER) is 1251 -- Remove all characters except for the first `n'. 1252 -- Do nothing if `n' >= `count'. 1253 -- 1254 -- See also `keep_tail', `remove_head', `remove_tail'. 1255 require 1256 n_non_negative: n >= 0 1257 do 1258 if n < count then 1259 remove_tail(count - n) 1260 end 1261 ensure 1262 count = n.min(old count) 1263 end 1264 1265 keep_tail (n: INTEGER) is 1266 -- Remove all characters except for the last `n'. 1267 -- Do nothing if `n' >= `count'. 1268 -- 1269 -- See also `keep_head', `remove_tail', `remove_head'. 1270 require 1271 n_non_negative: n >= 0 1272 do 1273 if n < count then 1274 remove_head(count - n) 1275 end 1276 ensure 1277 count = n.min(old count) 1278 end 1279 1280 remove_first is 1281 -- Remove the `first' item. 1282 -- 1283 -- See also `remove_head', `remove_last', `remove'. 1284 require 1285 not is_empty 1286 do 1287 --*** May be improved? 1288 remove_between(1, 1) 1289 ensure 1290 count = old count - 1 1291 end 1292 1293 remove_head (n: INTEGER) is 1294 -- Remove `n' first characters. If `n' >= `count', remove all. 1295 -- 1296 -- See also `remove_tail', `remove', `remove_the_first'. 
1297 require 1298 n_non_negative: n >= 0 1299 do 1300 if n > count then 1301 count := 0 1302 low_surrogate_indexes.make(0) 1303 low_surrogate_values.make(0) 1304 else 1305 if n > 0 then 1306 remove_between(1, n) 1307 end 1308 end 1309 ensure 1310 count = (old count - n).max(0) 1311 end 1312 1313 remove_last is 1314 -- Remove the `last' item. 1315 -- 1316 -- See also `remove_tail', `remove_first', `remove'. 1317 require 1318 not is_empty 1319 do 1320 --*** May be improved 1321 remove_tail(1) 1322 ensure 1323 count = old count - 1 1324 end 1325 1326 remove_tail (n: INTEGER) is 1327 -- Remove `n' last characters. If `n' >= `count', remove all. 1328 -- 1329 -- See also `remove_head', `remove', `remove_the_last'. 1330 require 1331 n_non_negative: n >= 0 1332 local 1333 i: INTEGER 1334 do 1335 if n > count then 1336 count := 0 1337 low_surrogate_indexes.make(0) 1338 low_surrogate_values.make(0) 1339 else 1340 count := count - n 1341 i := low_surrogate_position(count + 1) 1342 low_surrogate_indexes.resize(i) 1343 low_surrogate_values.resize(i) 1344 end 1345 ensure 1346 count = (old count - n).max(0) 1347 end 1348 1349 remove_substring, remove_between (start_index, end_index: INTEGER) is 1350 -- Remove all characters from `strt_index' to `end_index' inclusive. 
1351 require 1352 valid_start_index: 1 <= start_index 1353 valid_end_index: end_index <= count 1354 meaningful_interval: start_index <= end_index + 1 1355 local 1356 i, k, len: INTEGER 1357 do 1358 len := end_index - start_index + 1 1359 if len > 0 then 1360 from 1361 i := low_surrogate_position(start_index) 1362 k := low_surrogate_position(end_index + 1) 1363 until 1364 k > low_surrogate_indexes.upper 1365 loop 1366 low_surrogate_indexes.put(low_surrogate_indexes.item(k) - len, i) 1367 low_surrogate_values.put(low_surrogate_values.item(k), i) 1368 k := k + 1 1369 i := i + 1 1370 end 1371 low_surrogate_indexes.resize(i) 1372 low_surrogate_values.resize(i) 1373 storage.slice_copy(start_index - 1, storage, end_index, count - 1) 1374 count := count - len 1375 end 1376 ensure 1377 count = old count - (end_index - start_index + 1) 1378 end 1379 1380 remove_suffix (s: UNICODE_STRING) is 1381 -- Remove the suffix `s' of current string. 1382 -- 1383 -- See also `remove_prefix', `remove_tail', `remove'. 1384 require 1385 has_suffix(s) 1386 do 1387 not_yet_implemented 1388 -- remove_last(s.count); equal sequence may have different size 1389 ensure 1390 (old Current.twin).is_equal(Current + old s.twin) 1391 end 1392 1393 remove_prefix (s: UNICODE_STRING) is 1394 -- Remove the prefix `s' of current string. 1395 -- 1396 -- See also `remove_suffix', `remove_head', `remove'. 1397 require 1398 has_prefix(s) 1399 do 1400 not_yet_implemented 1401 -- remove_head(s.count); equal sequence may have different size 1402 ensure 1403 (old Current.twin).is_equal(old s.twin + Current) 1404 end 1405 1406 left_adjust is 1407 -- Remove leading blanks. 1408 -- 1409 -- See also `remove_head', `first'. 
1410 local 1411 i: INTEGER 1412 do 1413 from 1414 i := 1 1415 until 1416 i > count or else not is_space(item(i -- not_yet_implemented -- handle combining characters 1417 )) 1418 loop 1419 i := i + 1 1420 end 1421 remove_head(i - 1) 1422 ensure 1423 -- not_yet_implemented -- handle combining characters 1424 stripped: is_empty or else not is_space(first) 1425 end 1426 1427 right_adjust is 1428 -- Remove trailing blanks. 1429 -- 1430 -- See also `remove_tail', `last'. 1431 local 1432 i: INTEGER 1433 do 1434 from 1435 until 1436 count = 0 or else not is_space(item(count -- not_yet_implemented -- handle combining characters 1437 )) 1438 loop 1439 count := count - 1 1440 end 1441 i := low_surrogate_position(count + 1) 1442 low_surrogate_indexes.resize(i) 1443 low_surrogate_values.resize(i) 1444 ensure 1445 -- not_yet_implemented -- handle combining characters 1446 stripped: is_empty or else not is_space(last) 1447 end 1448 1449feature {ANY} -- Printing: 1450 out_in_tagged_out_memory is 1451 do 1452 utf8_encode_in(tagged_out_memory) 1453 end 1454 1455 fill_tagged_out_memory is 1456 do 1457 tagged_out_memory.append(once "count: ") 1458 count.append_in(tagged_out_memory) 1459 tagged_out_memory.append(once "capacity: ") 1460 capacity.append_in(tagged_out_memory) 1461 tagged_out_memory.append(once "storage: %"") 1462 utf8_encode_in(tagged_out_memory) 1463 tagged_out_memory.append_character('%"') 1464 end 1465 1466feature {ANY} -- Other features: 1467 first: UNICODE_CHARACTER is 1468 -- The first character. 1469 -- 1470 -- See also `last', `item'. 1471 do 1472 Result.set(g_utf8_get_char(handle)) 1473 end 1474 1475 last: INTEGER is 1476 -- The last character. 1477 -- 1478 -- See also `first', `item'. 1479 do 1480 end 1481 1482 substring (a_start_index, an_end_index: like lower): like Current is 1483 -- New string consisting of items [`start_index'.. `end_index']. 1484 -- 1485 -- See also `substring_index' and `copy_substring' to save memory. 
1486 require 1487 valid_start_index: 1 <= start_index 1488 valid_end_index: end_index <= count 1489 meaningful_interval: start_index <= end_index + 1 1490 local 1491 location, end_point: POINTER 1492 do 1493 create Result.with_capacity(end_index - start_index + 1) 1494 from location:=g_utf8_offset_to_pointer(handle,a_start_index.to_integer_32) 1495 until 1496 loop 1497 end 1498 ensure 1499 substring_count: Result.count = end_index - start_index + 1 1500 end 1501 1502 extend_multiple (unicode: INTEGER; n: INTEGER) is 1503 -- Extend Current with `n' times character `unicode'. 1504 require 1505 n >= 0 1506 valid_unicode_value: valid_unicode(unicode) 1507 local 1508 i: INTEGER 1509 do 1510 from 1511 i := n 1512 until 1513 i = 0 1514 loop 1515 append_character(unicode) 1516 i := i - 1 1517 end 1518 ensure 1519 count = n + old count 1520 end 1521 1522 precede_multiple (unicode: INTEGER; n: INTEGER) is 1523 -- Prepend `n' times character `unicode' to Current. 1524 require 1525 n >= 0 1526 valid_unicode_value: valid_unicode(unicode) 1527 local 1528 i: INTEGER 1529 do 1530 if n > 0 then 1531 if count = 0 then 1532 extend_multiple(unicode, n) 1533 else 1534 --|*** May be implemented in a more efficient way... 1535 from 1536 i := n 1537 until 1538 i = 0 1539 loop 1540 precede(unicode) 1541 i := i - 1 1542 end 1543 end 1544 end 1545 ensure 1546 count = n + old count 1547 end 1548 1549 extend_to_count (unicode: INTEGER; needed_count: INTEGER) is 1550 -- Extend Current with `unicode' until `needed_count' is reached. 1551 -- Do nothing if `needed_count' is already greater or equal 1552 -- to `count'. 1553 require 1554 needed_count >= 0 1555 valid_unicode_value: valid_unicode(unicode) 1556 do 1557 if needed_count > count then 1558 extend_multiple(unicode, needed_count - count) 1559 end 1560 ensure 1561 count >= needed_count 1562 end 1563 1564 precede_to_count (unicode: INTEGER; needed_count: INTEGER) is 1565 -- Prepend `unicode' to Current until `needed_count' is reached. 
1566 -- Do nothing if `needed_count' is already greater or equal 1567 -- to `count'. 1568 require 1569 needed_count >= 0 1570 valid_unicode_value: valid_unicode(unicode) 1571 do 1572 if needed_count > count then 1573 precede_multiple(unicode, needed_count - count) 1574 end 1575 ensure 1576 count >= needed_count 1577 end 1578 1579 reverse is 1580 -- Reverse the string. 1581 local 1582 i1, i2: INTEGER 1583 do 1584 not_yet_implemented 1585 --|*** reverse grapheme 1586 from 1587 i1 := 1 1588 i2 := count 1589 until 1590 i1 >= i2 1591 loop 1592 swap(i1, i2) 1593 i1 := i1 + 1 1594 i2 := i2 - 1 1595 end 1596 end 1597 1598 remove_all_occurrences (unicode: INTEGER) is 1599 -- Remove all occurrences of `unicode'. 1600 -- 1601 -- See also `occurrences', `remove'. 1602 require 1603 valid_unicode_value: valid_unicode(unicode) 1604 local 1605 i: INTEGER 1606 do 1607 --|*** May be implemented in a more efficient way... 1608 from 1609 i := index_of(unicode, 1) 1610 until 1611 i = 0 1612 loop 1613 remove(i) 1614 i := index_of(unicode, i) 1615 end 1616 ensure 1617 count = old count - old occurrences(unicode) 1618 end 1619 1620 substring_index (other: UNICODE_STRING; start_index: INTEGER): INTEGER is 1621 -- Position of first occurrence of `other' at or after `start', 0 if none. 1622 -- 1623 -- See also `substring', `first_substring_index'. 1624 require 1625 other_not_void: other /= Void 1626 valid_start_index: start_index >= 1 and start_index <= count + 1 1627 do 1628 not_yet_implemented 1629 end 1630 1631 first_substring_index (other: UNICODE_STRING): INTEGER is 1632 -- Position of first occurrence of `other' at or after 1, 0 if none. 1633 -- 1634 -- See also `substring_index'. 1635 require 1636 other_not_void: other /= Void 1637 do 1638 Result := substring_index(other, 1) 1639 ensure 1640 definition: Result = substring_index(other, 1) 1641 end 1642 1643feature {ANY} -- Splitting a STRING: 1644 split: ARRAY[UTF8_STRING] is 1645 -- Split the string into an array of words. 
Uses `is_separator' 1646 -- to find words. Gives Void or a non empty array. 1647 -- 1648 -- See also `split_in'. 1649 do 1650 if count > 0 then 1651 split_buffer.clear_count 1652 split_in(split_buffer) 1653 if not split_buffer.is_empty then 1654 Result := split_buffer.twin 1655 end 1656 end 1657 ensure 1658 Result /= Void implies not Result.is_empty 1659 end 1660 1661 split_in (words: COLLECTION[UTF_STRING]) is 1662 -- Same jobs as `split' but result is appended in `words'. 1663 -- 1664 -- See also `split'. 1665 require 1666 words /= Void 1667 do 1668 ensure 1669 words.count >= old words.count 1670 end 1671 1672 get_new_iterator: ITERATOR[INTEGER] is 1673 do 1674 create {ITERATOR_ON_UNICODE_STRING} Result.make(Current) 1675 end 1676 1677feature {} -- Implementation 1678 split_buffer: ARRAY[UTF8_STRING] is 1679 once 1680 create Result.with_capacity(4, 1) 1681 end 1682 1683feature {} 1684 -- TODO: In UNICODE_STRING we have "manifest_initialize (c: like capacity; s: like storage; ls_cap: INTEGER; lsv: NATIVE_ARRAY[INTEGER_16]; lsi: NATIVE_ARRAY[INTEGER]) is -- This function is a compiler-hook automatically called when -- a manifest unicode string (i.e. U"foo") is used in the Eiffel -- source code." Provide an UTF8 equivalent 1685 1686-- g_utf8_next_char() 1687 1688-- #define g_utf8_next_char(p) 1689 1690-- Skips to the next character in a UTF-8 string. The string must be valid; this 1691-- macro is as fast as possible, and has no error-checking. You would use this macro 1692-- to iterate over a string character by character. The macro returns the start of 1693-- the next UTF-8 character. Before using this macro, use g_utf8_validate() to 1694-- validate strings that may contain invalid UTF-8. 1695 1696-- p : Pointer to the start of a valid UTF-8 character. 
1697 1698-- --------------------------------------------------------------------------------- 1699 1700-- g_utf8_get_char () 1701 1702-- gunichar g_utf8_get_char (const gchar *p); 1703 1704-- Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If p does 1705-- not point to a valid UTF-8 encoded character, results are undefined. If you are 1706-- not sure that the bytes are complete valid Unicode characters, you should use 1707-- g_utf8_get_char_validated() instead. 1708 1709-- p : a pointer to Unicode character encoded as UTF-8 1710-- Returns : the resulting character 1711 1712-- --------------------------------------------------------------------------------- 1713 1714-- g_utf8_get_char_validated () 1715 1716-- gunichar g_utf8_get_char_validated (const gchar *p, 1717-- gssize max_len); 1718 1719-- Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This 1720-- function checks for incomplete characters, for invalid characters such as 1721-- characters that are out of the range of Unicode, and for overlong encodings of 1722-- valid characters. 1723 1724-- p : a pointer to Unicode character encoded as UTF-8 1725-- max_len : the maximum number of bytes to read, or -1, for no maximum. 1726-- Returns : the resulting character. If p points to a partial sequence at the end 1727-- of a string that could begin a valid character, returns (gunichar)-2; 1728-- otherwise, if p does not point to a valid UTF-8 encoded Unicode 1729-- character, returns (gunichar)-1. 1730 1731-- --------------------------------------------------------------------------------- 1732 1733-- g_utf8_offset_to_pointer () 1734 1735-- gchar* g_utf8_offset_to_pointer (const gchar *str, 1736-- glong offset); 1737 1738-- Converts from an integer character offset to a pointer to a position within the 1739-- string. 1740 1741-- Since 2.10, this function allows to pass a negative offset to step backwards. 
-- It is usually worth stepping backwards from the end instead of forwards if offset is
-- in the last fourth of the string, since moving forward is about 3 times faster
-- than moving backward.

-- str : a UTF-8 encoded string
-- offset : a character offset within str
-- Returns : the resulting pointer

-- ---------------------------------------------------------------------------------

-- g_utf8_pointer_to_offset ()

-- glong g_utf8_pointer_to_offset (const gchar *str,
-- const gchar *pos);

-- Converts from a pointer to position within a string to an integer character
-- offset.

-- Since 2.10, this function allows pos to be before str, and returns a negative
-- offset in this case.

-- str : a UTF-8 encoded string
-- pos : a pointer to a position within str
-- Returns : the resulting character offset

-- ---------------------------------------------------------------------------------

-- g_utf8_prev_char ()

-- gchar* g_utf8_prev_char (const gchar *p);

-- Finds the previous UTF-8 character in the string before p.

-- p does not have to be at the beginning of a UTF-8 character. No check is made to
-- see if the character found is actually valid other than it starts with an
-- appropriate byte. If p might be the first character of the string, you must use
-- g_utf8_find_prev_char() instead.

-- p : a pointer to a position within a UTF-8 encoded string
-- Returns : a pointer to the found character.

-- ---------------------------------------------------------------------------------

-- g_utf8_find_next_char ()

-- gchar* g_utf8_find_next_char (const gchar *p,
-- const gchar *end);

-- Finds the start of the next UTF-8 character in the string after p.

-- p does not have to be at the beginning of a UTF-8 character.
-- No check is made to
-- see if the character found is actually valid other than it starts with an
-- appropriate byte.

-- p : a pointer to a position within a UTF-8 encoded string
-- end : a pointer to the end of the string, or NULL to indicate that the string
-- is nul-terminated
-- Returns : a pointer to the found character or NULL

-- ---------------------------------------------------------------------------------

-- g_utf8_find_prev_char ()

-- gchar* g_utf8_find_prev_char (const gchar *str,
-- const gchar *p);

-- Given a position p within a UTF-8 encoded string str, find the start of the
-- previous UTF-8 character starting before p. Returns NULL if no UTF-8 characters
-- are present in str before p.

-- p does not have to be at the beginning of a UTF-8 character. No check is made to
-- see if the character found is actually valid other than it starts with an
-- appropriate byte.

-- str : pointer to the beginning of a UTF-8 encoded string
-- p : pointer to some position within str
-- Returns : a pointer to the found character or NULL.

-- ---------------------------------------------------------------------------------

-- g_utf8_strlen ()

-- glong g_utf8_strlen (const gchar *p,
-- gssize max);

-- Returns the length of the string in characters.

-- p : pointer to the start of a UTF-8 encoded string.
-- max : the maximum number of bytes to examine. If max is less than 0, then the
-- string is assumed to be nul-terminated. If max is 0, p will not be
-- examined and may be NULL.
1833-- Returns : the length of the string in characters 1834 1835-- --------------------------------------------------------------------------------- 1836 1837-- g_utf8_strncpy () 1838 1839-- gchar* g_utf8_strncpy (gchar *dest, 1840-- const gchar *src, 1841-- gsize n); 1842 1843-- Like the standard C strncpy() function, but copies a given number of characters 1844-- instead of a given number of bytes. The src string must be valid UTF-8 encoded 1845-- text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility 1846-- functions with it.) 1847 1848-- dest : buffer to fill with characters from src 1849-- src : UTF-8…
Large files files are truncated, but you can click here to view the full file