/src/lib/string/unicode_string.e
Specman e | 1920 lines | 1489 code | 108 blank | 323 comment | 88 complexity | ae54c695b3e9b74c411709b56358f4d8 MD5 | raw file
-- This file is part of a Liberty Eiffel library.
-- See the full copyright at the end.
--
class UNICODE_STRING
   --
   -- WARNING: THIS CLASS IS A WORK IN PROGRESS. SOME FEATURES ARE NOT
   -- YET IMPLEMENTED AND SOME FEATURES MAY APPEAR/DISAPPEAR.
   --
   -- A UNICODE_STRING is a resizable string written with unicode values.
   -- From unicode.org: "Unicode provides a unique number for every
   -- character,
   -- no matter what the platform,
   -- no matter what the program,
   -- no matter what the language."
   --
   -- WARNING: a grapheme may be described with many codes. A
   -- grapheme may be defined as a "user character". The Angstrom sign is
   -- one grapheme but may be defined using (LETTER A + COMBINING RING).
   -- Unicode strings may be accessed in two ways:
   --  - low-level (code by code)
   --  - high-level (grapheme by grapheme)
   --
   -- Unless otherwise specified, the unit of all functions is the unicode number.
   --

inherit
   HASHABLE
      redefine copy, out_in_tagged_out_memory, fill_tagged_out_memory
      end
   COMPARABLE
      redefine is_equal, copy, compare, three_way_comparison, out_in_tagged_out_memory, fill_tagged_out_memory
      end
   TRAVERSABLE[INTEGER]
      redefine is_equal, copy, out_in_tagged_out_memory, fill_tagged_out_memory, next_generation
      end
   SEARCHABLE[INTEGER]
      redefine is_equal, copy, out_in_tagged_out_memory, fill_tagged_out_memory
      end
   RECYCLABLE
      redefine is_equal, copy, out_in_tagged_out_memory, fill_tagged_out_memory
      end

insert
   UNICODE_STRING_HELPER
      redefine is_equal, copy, out_in_tagged_out_memory, fill_tagged_out_memory
      end

create {ANY}
   make, copy, make_empty, make_filled, from_utf8

feature {UNICODE_STRING, UNICODE_STRING_HANDLER}
   storage: NATIVE_ARRAY[INTEGER_16]
         -- The place where characters are stored.
         -- WARNING: this is only the `storage' area. Each Unicode value is
         -- stored using 2 bytes (CHARACTER). Encoding used is UTF-16NE.
         -- Low surrogates are stored separately to allow direct access.

feature {ANY}
   count: INTEGER
         -- Length of the string, which is also the largest valid index.
         --
         -- See also `is_empty', `lower', `upper'.

   capacity: INTEGER
         -- Number of slots available in the `storage' area.

   lower: INTEGER 1
         -- Smallest index; always 1 (this feature exists to mirror the
         -- COLLECTION hierarchy).
         --
         -- See also `upper', `valid_index', `item'.

   upper: INTEGER
         -- Largest index; always equal to `count' (this feature exists to
         -- mirror the COLLECTION hierarchy).
         --
         -- See also `lower', `valid_index', `item'.
      do
         Result := count
      ensure
         Result = count
      end

feature {ANY} -- Creation / Modification:
   make (needed_capacity: INTEGER)
         -- Reset the string to empty, making sure that `storage' can hold
         -- at least `needed_capacity' codes.
      require
         non_negative_size: needed_capacity >= 0
      do
         -- Only allocate when the current area is too small.
         if needed_capacity > 0 and then capacity < needed_capacity then
            storage := storage.calloc(needed_capacity)
            capacity := needed_capacity
         end
         count := 0
         -- The surrogate side tables are created on first use and merely
         -- emptied when the object is re-initialised.
         if low_surrogate_indexes /= Void then
            low_surrogate_indexes.make(0)
            low_surrogate_values.make(0)
         else
            create low_surrogate_indexes.make(0)
            create low_surrogate_values.make(0)
         end
         next_generation
      ensure
         needed_capacity <= capacity
         empty_string: count = 0
      end

   make_empty
         -- Initialise as an empty string without reserving storage.
      do
         make(0)
      end

   make_filled (unicode: INTEGER; n: INTEGER)
         -- Initialise the string with `n' copies of `unicode'.
      require
         valid_count: n >= 0
         valid_unicode_value: valid_unicode(unicode)
      do
         make(n)
         -- `fill_with' overwrites positions 1 .. `count', so the length
         -- must be set before calling it.
         count := n
         fill_with(unicode)
      ensure
         count_set: count = n
         filled: occurrences(unicode) = count
      end

feature {ANY} -- Testing:
   is_empty: BOOLEAN
         -- Is the string length 0?
         --
         -- See also `count'.
do
         Result := count = 0
      end

   item (i: INTEGER): INTEGER
         -- Unicode value at position `i'.
         --
         -- See also `lower', `upper', `valid_index', `put'.
      local
         n: INTEGER
      do
         n := storage.item(i - 1)
         if n & 0x0000F800 = 0x0000D800 then
            -- High surrogate: rebuild the code point from the 10 high bits
            -- kept in `storage' and the low part kept in the side table.
            Result := n & 0x000007FF + 64
            Result := Result * 1024 + low_surrogate_value(i)
         else
            -- Plain 16-bit value; the mask discards the sign extension of
            -- the INTEGER_16 cell.
            Result := n & 0x0000FFFF
         end
      end

   infix "@" (i: INTEGER): INTEGER
         -- Infix notation, a plain synonym for `item'.
         --
         -- See also `item', `put'.
      require
         valid_index(i)
      do
         Result := item(i)
      ensure
         definition: Result = item(i)
      end

   hash_code: INTEGER
         -- Hash value computed from the `count' first cells of `storage'
         -- and from every entry of `low_surrogate_values'.
      local
         i, j: INTEGER
      do
         from
            i := 0
         until
            i >= count
         loop
            Result := Result #* 5 #+ storage.item(i)
            i := i + 1
         end
         from
            j := low_surrogate_values.upper
         until
            -- Stop only once `j' has gone below `lower': the original test
            -- `j <= lower' left the entry at index `lower' out of the hash,
            -- so a string holding a single surrogate pair was hashed
            -- without its low part.
            j < low_surrogate_values.lower
         loop
            Result := Result #* 5 #+ low_surrogate_values.item(j)
            j := j - 1
         end
         if Result < 0 then
            -- Fold negative values back into the non-negative range.
            Result := -(Result + 1)
         end
      end

   infix "<" (other: like Current): BOOLEAN
         -- Is `Current' less than `other'? Items are compared one by one by
         -- unicode value; a strict prefix is less than the longer string.
         --
         -- See also `>', `<=', `>=', `min', `max'.
      local
         i: INTEGER; maxi: INTEGER
      do
         from
            i := 1
            maxi := count.min(other.count)
         until
            i > maxi or else item(i) /= other.item(i)
         loop
            i := i + 1
         end
         if i <= maxi then
            -- First differing position decides.
            Result := item(i) < other.item(i)
         else
            -- Common prefix exhausted: `Current' is smaller only when
            -- `other' still has items left.
            Result := i <= other.count
         end
         --not_yet_implemented
      end

   compare, three_way_comparison (other: like Current): INTEGER
         -- Not implemented yet.
      do
         not_yet_implemented
         -- redefine needed ?
      end

   is_equal (other: like Current): BOOLEAN
         -- Do both strings have the same character sequence?
         --
         -- See also `same_as'.
do
         if other = Current then
            Result := True
         elseif count = other.count and then low_surrogate_values.is_equal(other.low_surrogate_values) and then low_surrogate_indexes.is_equal(other.low_surrogate_indexes) then
            -- Same length and same surrogate tables: compare the cells.
            Result := storage.fast_memcmp(other.storage, count)
         end
         -- (a canonical-equivalence fallback for the `not Result' case is
         -- not yet implemented)
      end

   same_as (other: UNICODE_STRING): BOOLEAN
         -- Case insensitive `is_equal'. Not implemented yet.
      require
         other /= Void
      do
         not_yet_implemented
      end

   index_of, fast_index_of (unicode: INTEGER; start_index: INTEGER): INTEGER
         -- Index of first occurrence of `unicode' at or after `start_index',
         -- 0 if none.
         --
         -- See also `reverse_index_of', `first_index_of', `last_index_of', `has'.
      require
         valid_unicode_value: valid_unicode(unicode)
      local
         code: INTEGER_16; remainder: INTEGER_16; i: INTEGER
      do
         if unicode >= 0x00010000 then
            -- Stored as high and low surrogate. The high part kept in
            -- `storage' carries the 0xD800 tag that `item', `put' and
            -- `swap' test for, so the searched code must carry it too
            -- (the untagged value could never satisfy the postcondition).
            code := (unicode #// 1024 - 64 + 0x0000D800).low_16
            remainder := (unicode & 0x000003FF).to_integer_16
            -- Skip the surrogate entries located before `start_index'.
            from
               i := 0
            until
               i > low_surrogate_indexes.upper or else low_surrogate_indexes.item(i) >= start_index
            loop
               i := i + 1
            end
            -- Then look for an entry whose low and high parts both match.
            from
            until
               i > low_surrogate_indexes.upper or else low_surrogate_values.item(i) = remainder and then storage.item(low_surrogate_indexes.item(i) - 1) = code
            loop
               i := i + 1
            end
            if i <= low_surrogate_indexes.upper then
               Result := low_surrogate_indexes.item(i)
            end
         else
            -- Not a surrogate character: scan `storage' directly.
            -- `i' is the 0-based cell, `Result' the matching 1-based index.
            code := unicode.low_16
            from
               i := start_index - 1
               Result := start_index
            until
               Result > count or else storage.item(i) = code
            loop
               Result := Result + 1
               i := i + 1
            end
            if Result > count then
               Result := 0
            end
         end
      ensure then
         Result /= 0 implies item(Result) = unicode
      end

   reverse_index_of, fast_reverse_index_of (unicode: INTEGER; start_index: INTEGER): INTEGER
         -- Index of first occurrence of `unicode' at or before `start_index', 0 if none.
         -- The search is done in reverse direction, which means from the `start_index' down
         -- to the first character.
         --
         -- See also `index_of', `last_index_of', `first_index_of'.
      --require
      --   valid_start_index: start_index >= 0 and start_index <= count
      --   valid_unicode_value: valid_unicode(unicode)
      do
         from
            Result := start_index
         until
            Result = 0 or else unicode = item(Result)
         loop
            Result := Result - 1
         end
      ensure then
         Result /= 0 implies item(Result) = unicode
      end

   first_index_of, fast_first_index_of (unicode: INTEGER): INTEGER
         -- Index of first occurrence of `unicode' at index 1 or after index 1.
         --
         -- See also `last_index_of', `index_of', `reverse_index_of'.
      do
         Result := index_of(unicode, 1)
      end

   last_index_of, fast_last_index_of (unicode: INTEGER): INTEGER
         -- Index of last occurrence of `unicode', 0 if none.
         --
         -- See also `first_index_of', `reverse_index_of', `index_of'.
      do
         Result := reverse_index_of(unicode, upper)
      end

   has, fast_has (unicode: INTEGER): BOOLEAN
         -- True if `unicode' is in the STRING.
         --
         -- See also `index_of', `occurrences', `has_substring'.
      require
         valid_unicode_value: valid_unicode(unicode)
      do
         Result := index_of(unicode, 1) /= 0
      end

   has_substring (other: UNICODE_STRING): BOOLEAN
         -- True if `Current' contains `other'.
         --
         -- See also `substring_index', `has'.
      require
         other_not_void: other /= Void
      do
         Result := substring_index(other, 1) /= 0
      end

   occurrences (unicode: INTEGER): INTEGER
         -- Number of times character `unicode' appears in the string.
         --
         -- See also `remove_all_occurrences', `has'.
require
         valid_unicode_value: valid_unicode(unicode)
      local
         i: INTEGER
      do
         from
            i := index_of(unicode, 1)
         until
            i = 0
         loop
            Result := Result + 1
            -- Resume strictly after the match just counted. Searching again
            -- from `i' itself returns `i' forever (`index_of' looks "at or
            -- after" its start index), which made this loop endless as soon
            -- as one occurrence existed. `replace_all' uses the same
            -- `i + 1' stepping.
            i := index_of(unicode, i + 1)
         end
      ensure
         Result >= 0
      end

   has_suffix (s: UNICODE_STRING): BOOLEAN
         -- True if suffix of `Current' is `s'.
         --
         -- See also `remove_suffix', `has_prefix', `has_substring'.
      require
         s /= Void
      local
         i, offset: INTEGER
      do
         -- `offset' maps index `i' of `s' onto the tail of `Current'.
         offset := count - s.count
         from
            Result := offset >= 0
            i := lower
         until
            not Result or else i > s.upper
         loop
            Result := item(i + offset) = s.item(i)
            i := i + 1
         end
      end

   has_prefix (p: UNICODE_STRING): BOOLEAN
         -- True if prefix of `Current' is `p'.
      require
         p /= Void
      local
         i: INTEGER
      do
         from
            Result := count >= p.count
            i := lower
         until
            not Result or else i > p.upper
         loop
            Result := item(i) = p.item(i)
            i := i + 1
         end
      end

feature {ANY} -- Testing and Conversion:
   is_ascii: BOOLEAN
         -- True if every unicode value is in range 0..127
      local
         i: INTEGER
      do
         -- Walk the cells backwards and stop at the first one having a bit
         -- set above the 7 low bits.
         from
            i := count - 1
         until
            i < 0 or else storage.item(i) & 0xFF80 /= 0
         loop
            i := i - 1
         end
         Result := i < 0
      end

   to_utf8: STRING
         -- New string is created, current unicode string is encoded
         -- with UTF-8 format.
         --
         -- See also: `utf8_encode_in' and `as_utf8' to save memory.
      do
         tmp_buffer.clear_count
         utf8_encode_in(tmp_buffer)
         Result := tmp_buffer.twin
      end

   as_utf8: STRING
         -- Encode the string in UTF-8. Always returns the same once object.
         --
         -- See also: `to_utf8', `utf8_encode_in'.
      do
         Result := once ""
         Result.clear_count
         utf8_encode_in(Result)
      end

   utf8_encode_in (s: STRING)
         -- Append the string in UTF-8 to `s'.
         --
         -- See also: `to_utf8', `as_utf8'.
require
         s /= Void
      local
         idx: INTEGER
      do
         from
            idx := 1
         until
            idx > count
         loop
            utf8_character_in(item(idx), s)
            idx := idx + 1
         end
      end

   utf16be_encode_in (s: STRING)
         -- Append the string in UTF-16BE to `s'
      require
         s /= Void
      local
         cell, sur: INTEGER; unit: INTEGER_16
      do
         -- `cell' walks `storage' (0-based); `sur' walks the surrogate
         -- tables in step with the high surrogates met on the way.
         from
         until
            cell >= count
         loop
            unit := storage.item(cell)
            s.extend((unit |>>> 8).to_character)
            s.extend((unit & 0x00FF).to_character)
            if unit & 0xF800 = 0xD800 then
               check
                  low_surrogate_indexes.item(sur) = cell + 1
               end
               -- Emit the low surrogate; 220 (0xDC) supplies its tag byte.
               s.extend((low_surrogate_values.item(sur) #// 256 + 220).to_character)
               s.extend((low_surrogate_values.item(sur) & 0x00FF).to_character)
               sur := sur + 1
            end
            cell := cell + 1
         end
      end

   utf8_decode_from (s: ABSTRACT_STRING): BOOLEAN
         -- Decode `s' as UTF-8 and append the decoded values to `Current'.
         -- Every malformed piece is replaced by 65533 (U+FFFD).
         -- Return `False' if the decoding process failed somewhere.
      require
         s /= Void
      local
         pos, pending, seq_length: INTEGER; code: INTEGER
      do
         from
            Result := True
            pos := 1
         until
            pos > s.count
         loop
            code := s.item(pos).code
            pos := pos + 1
            -- The lead byte gives the sequence length and the first
            -- payload bits.
            inspect
               code
            when 0 .. 127 then
               extend(code)
               pending := 0
            when 192 .. 223 then
               code := code - 192
               pending := 2
            when 224 .. 239 then
               code := code - 224
               pending := 3
            when 240 .. 247 then
               code := code - 240
               pending := 4
            else
               extend(65533)
               Result := False
               pending := 0
            end
            -- Consume the continuation bytes (128 .. 191). `pending'
            -- reaches 1 when the sequence is complete and is forced to 0
            -- when it is broken.
            from
               seq_length := pending
            until
               pending <= 1
            loop
               if pos <= s.count and then s.item(pos).code.in_range(128, 191) then
                  code := code * 64 + s.item(pos).code - 128
                  pos := pos + 1
                  pending := pending - 1
               else
                  extend(65533)
                  Result := False
                  pending := 0
               end
            end
            if pending = 1 then
               -- Overlong sequences must be refused by any UTF-8 compliant
               -- decoder for security reasons; invalid unicode values are
               -- refused as well.
               if code < 128 or else (code < 2048 and then seq_length > 2) or else (code < 65536 and then seq_length > 3) or else not valid_unicode(code) then
                  extend(65533)
                  Result := False
               else
                  extend(code)
               end
            end
         end
      end

feature {}
   from_utf8 (s: ABSTRACT_STRING)
         -- Use `s' as UTF-8 format encoded unicode string
         -- This function may be used for manifest strings
         -- See `utf8_decode_from' for error detection
      require
         s /= Void
      local
         error: BOOLEAN
      do
         make(s.count)
         -- The decoding status is deliberately ignored here.
         error := utf8_decode_from(s)
      end

feature {ANY} -- Modification:
   resize (new_count: INTEGER)
         -- Resize Current. When `new_count' is greater than
         -- `count', new positions are initialized with unicode 0.
      require
         new_count >= 0
      local
         sur: INTEGER
      do
         if new_count > count then
            if capacity < new_count then
               if capacity = 0 then
                  storage := storage.calloc(new_count)
               else
                  storage := storage.realloc(capacity, new_count)
               end
               capacity := new_count
            else
               storage.clear(count, new_count - 1)
            end
         end
         count := new_count
         -- Drop the surrogate entries that now lie beyond the end.
         from
            sur := low_surrogate_indexes.upper
         until
            sur < 0 or else low_surrogate_indexes.item(sur) <= new_count
         loop
            --TODO: only one remove out of the loop
            low_surrogate_indexes.remove_last
            low_surrogate_values.remove_last
            sur := sur - 1
         end
         next_generation
      ensure
         count = new_count
         capacity >= old capacity
      end

   clear_count, wipe_out
         -- Discard all characters so that `is_empty' is True after that call.
         -- The internal `capacity' is not changed by this call (i.e. the internal `storage' memory
         -- is neither released nor shrunk).
         --
         -- See also `clear_count_and_capacity'.
      do
         count := 0
         low_surrogate_indexes.make(0)
         low_surrogate_values.make(0)
         next_generation
      ensure
         is_empty: count = 0
         capacity = old capacity
      end

   clear_count_and_capacity
         -- Discard all characters (`is_empty' is True after that call). The internal `capacity' may also be
         -- reduced after this call.
         --
         -- See also `clear_count'.
      do
         low_surrogate_indexes.clear_count_and_capacity
         low_surrogate_values.clear_count_and_capacity
         clear_count
         -- NOTE(review): with the two statements below disabled, nothing
         -- here resets `capacity', while the postcondition requires
         -- `capacity = 0' -- confirm which one is intended.
         --*** capacity := 0
         --*** storage := null_storage
         next_generation
      ensure
         is_empty: count = 0
         capacity = 0
      end

   copy (other: like Current)
         -- Copy `other' onto Current.
         --
         -- See also `copy_substring'.
      do
         count := other.count
         if count > 0 then
            if capacity < count then
               storage := storage.calloc(count)
               capacity := count
            end
            storage.copy_from(other.storage, count - 1)
         end
         -- `copy' is also a creation procedure, so the surrogate tables
         -- may not exist yet.
         if low_surrogate_indexes = Void then
            create low_surrogate_indexes.make(0)
            create low_surrogate_values.make(0)
         end
         low_surrogate_indexes.copy(other.low_surrogate_indexes)
         low_surrogate_values.copy(other.low_surrogate_values)
         next_generation
      ensure then
         count = other.count
      end

   copy_substring (s: like Current; start_index, end_index: INTEGER)
         -- Copy the substring from `s' from `start_index' to `end_index'
         -- to Current.
         --
         -- See also `copy'.
         --|*** DUMB IMPLEMENTATION
      require
         string_not_void: s /= Void
         valid_start_index: 1 <= start_index
         valid_end_index: end_index <= s.count
         meaningful_interval: start_index <= end_index + 1
      do
         clear_count
         append_substring(s, start_index, end_index)
      end

   fill_with (unicode: INTEGER)
         -- Replace every unicode with the new value.
require
         valid_unicode_value: valid_unicode(unicode)
      local
         i: INTEGER; code: INTEGER_16; remainder: INTEGER_16
      do
         if unicode >= 65536 then
            -- Stored as high and low surrogate. The high part written to
            -- `storage' must carry the 0xD800 tag: `item' only treats a
            -- cell as a surrogate when `cell & 0xF800 = 0xD800', so the
            -- untagged value was read back as a plain small code and the
            -- postcondition could not hold.
            code := (unicode #// 1024 - 64 + 0x0000D800).low_16
            remainder := (unicode & 0x000003FF).to_integer_16 --unicode #\\ 1024
            storage.set_all_with(code, count - 1)
            low_surrogate_values.resize(count)
            low_surrogate_values.set_all_with(remainder)
            -- Every position holds a surrogate: entry `i' of the table
            -- refers to the 1-based string index `i + 1'.
            from
               i := count - 1
               low_surrogate_indexes.resize(count)
            until
               i < 0
            loop
               low_surrogate_indexes.put(i + 1, i)
               i := i - 1
            end
         else
            code := unicode.low_16
            storage.set_all_with(code, count - 1)
            -- No surrogate is left in the string.
            low_surrogate_values.resize(0)
            low_surrogate_indexes.resize(0)
         end
         next_generation
      ensure
         occurrences(unicode) = count
      end

   replace_all (old_code, new_code: like item)
         -- Replace all occurrences of the element `old_code' by `new_code'.
      require
         valid_unicode_value: valid_unicode(old_code)
         valid_unicode_value: valid_unicode(new_code)
      local
         i: INTEGER
      do
         --*** May be implemented in a more efficient way...
         if old_code /= new_code then
            from
               i := index_of(old_code, 1)
            until
               i = 0
            loop
               put(new_code, i)
               i := index_of(old_code, i + 1)
            end
         end
         next_generation
      ensure
         count = old count
         old_code /= new_code implies occurrences(old_code) = 0
      end

   append, append_string (s: UNICODE_STRING)
         -- Append a copy of `s' to `Current'.
         --
         -- See also `add_last', `add_first', `prepend', `+'.
require
         s_not_void: s /= Void
      local
         s_count, needed_capacity, new_capacity, i: INTEGER; indexes: FAST_ARRAY[INTEGER]
      do
         s_count := s.count
         needed_capacity := count + s_count
         if needed_capacity > capacity then
            if capacity = 0 then
               storage := storage.calloc(needed_capacity)
               capacity := needed_capacity
            else
               -- Grow at least geometrically.
               new_capacity := (2 * capacity).max(needed_capacity)
               storage := storage.realloc(capacity, new_capacity)
               capacity := new_capacity
            end
         end
         storage.copy_at(count, s.storage, s_count)
         -- Import the surrogate entries of `s', shifted by the length
         -- `Current' had before the append.
         from
            indexes := s.low_surrogate_indexes
         until
            i > indexes.upper
         loop
            low_surrogate_indexes.add_last(indexes.item(i) + count)
            low_surrogate_values.add_last(s.low_surrogate_values.item(i))
            i := i + 1
         end
         count := needed_capacity
         next_generation
      end

   append_substring (s: like Current; start_index, end_index: INTEGER)
         -- Append the substring from `s' from `start_index' to `end_index'
         -- to Current.
         --|*** DUMB IMPLEMENTATION
      require
         string_not_void: s /= Void
         valid_start_index: 1 <= start_index
         valid_end_index: end_index <= s.count
         meaningful_interval: start_index <= end_index + 1
      local
         i: INTEGER
      do
         from
            i := start_index
         until
            i > end_index
         loop
            extend(s.item(i))
            i := i + 1
         end
      end

   prepend (other: UNICODE_STRING)
         -- Prepend `other' to `Current'.
         --
         -- See also `append'.
      require
         other /= Void
      local
         i, j, k: INTEGER
      do
         i := count
         j := other.count
         resize(i + j)
         if i > 0 and then j > 0 then
            -- Make room at the front and shift the existing surrogate
            -- indexes accordingly.
            storage.move(0, i - 1, j)
            from
               k := low_surrogate_indexes.upper
            until
               k < 0
            loop
               low_surrogate_indexes.put(low_surrogate_indexes.item(k) + j, k)
               k := k - 1
            end
         end
         -- May be implemented in a more efficient way...
         -- The entries of `other' are pushed in front, last one first, so
         -- that they end up in their original order.
         from
            k := other.low_surrogate_indexes.upper
         until
            k < 0
         loop
            low_surrogate_indexes.add_first(other.low_surrogate_indexes.item(k))
            low_surrogate_values.add_first(other.low_surrogate_values.item(k))
            k := k - 1
         end
         storage.copy_from(other.storage, j - 1)
         next_generation
      ensure
         (old other.twin + old Current.twin).is_equal(Current)
      end

   insert_string (s: UNICODE_STRING; i: INTEGER)
         -- Insert `s' at index `i', shifting characters from index `i'
         -- to `count' rightwards.
      require
         string_not_void: s /= Void
         valid_insertion_index: 1 <= i and i <= count + 1
      local
         old_count, s_count, pos, n, old_upper, added: INTEGER
      do
         old_count := count
         s_count := s.count
         resize(old_count + s_count)
         if i <= old_count then
            storage.move(i - 1, old_count - 1, s_count)
         end
         storage.copy_at(i - 1, s.storage, s_count)
         -- `pos' is the slot of the first surrogate entry located at or
         -- after the insertion point.
         pos := low_surrogate_position(i)
         old_upper := low_surrogate_indexes.upper
         added := s.low_surrogate_indexes.count
         low_surrogate_indexes.resize(old_upper + 1 + added)
         low_surrogate_values.resize(old_upper + 1 + added)
         -- Move ALL existing entries from `pos' on: `added' slots to the
         -- right in the tables and `s_count' positions to the right in the
         -- string. (The original loop moved only as many entries as `s'
         -- holds, so surrogates after the insertion point kept stale
         -- indexes whenever the two numbers differed.) Going from the last
         -- entry down keeps unread slots intact.
         from
            n := old_upper
         until
            n < pos
         loop
            low_surrogate_indexes.put(low_surrogate_indexes.item(n) + s_count, n + added)
            low_surrogate_values.put(low_surrogate_values.item(n), n + added)
            n := n - 1
         end
         -- Copy the entries of `s' into the gap. Index 1 of `s' lands on
         -- index `i' of `Current', hence the `i - 1' offset (the original
         -- used `i', which is one position too far).
         from
            n := 0
         until
            n >= added
         loop
            low_surrogate_indexes.put(s.low_surrogate_indexes.item(n) + i - 1, pos + n)
            low_surrogate_values.put(s.low_surrogate_values.item(n), pos + n)
            n := n + 1
         end
         next_generation
      end

   replace_substring (s: UNICODE_STRING; start_index, end_index: INTEGER)
         -- Replace the substring from `start_index' to `end_index',
         -- inclusive, with `s'.
require
         string_not_void: s /= Void
         valid_start_index: 1 <= start_index
         valid_end_index: end_index <= count
         meaningful_interval: start_index <= end_index + 1
      do
         -- May be implemented in a more efficient way...
         remove_between(start_index, end_index)
         insert_string(s, start_index)
      end

   infix "+" (other: UNICODE_STRING): like Current
         -- Create a new UNICODE_STRING which is the concatenation of
         -- `Current' and `other'.
         --
         -- See also `append'.
      require
         other_exists: other /= Void
      do
         create Result.make(count + other.count)
         Result.append(Current)
         Result.append(other)
      ensure
         result_count: Result.count = count + other.count
      end

   put (unicode: INTEGER; i: INTEGER)
         -- Put `unicode' at position `i'.
         --
         -- See also `item', `lower', `upper', `swap'.
      require
         valid_index: valid_index(i)
         valid_unicode_value: valid_unicode(unicode)
      local
         v, n: INTEGER
      do
         if unicode >= 65536 then
            -- Stored as high and low surrogate. The high part carries the
            -- 0xD800 tag that `item' and the tests below rely on to
            -- recognise a surrogate cell; without the tag the value just
            -- written was read back as a plain small code and
            -- `item(i) = unicode' could not hold.
            v := unicode #// 1024 - 64 + 0x0000D800
            if storage.item(i - 1) & 0xF800 = 0xD800 then
               -- Position `i' already holds a surrogate: reuse its entry.
               low_surrogate_values.put((unicode & 0x000003FF).to_integer_16, low_surrogate_index(i))
            else
               -- Open a slot at the right place of the surrogate tables.
               n := low_surrogate_position(i)
               low_surrogate_indexes.add_last(0)
               low_surrogate_values.add_last(0)
               if n /= low_surrogate_indexes.upper then
                  low_surrogate_indexes.move(n, low_surrogate_indexes.upper - 1, 1)
                  low_surrogate_values.move(n, low_surrogate_values.upper - 1, 1)
               end
               low_surrogate_indexes.put(i, n)
               low_surrogate_values.put((unicode & 0x000003FF).to_integer_16, n)
            end
            storage.put(v.low_16, i - 1)
         else
            if storage.item(i - 1) & 0xF800 = 0xD800 then
               -- A surrogate is replaced by a plain value: drop its entry.
               v := low_surrogate_index(i)
               low_surrogate_indexes.remove(v)
               low_surrogate_values.remove(v)
            end
            storage.put(unicode.low_16, i - 1)
         end
         next_generation
      ensure
         item(i) = unicode
      end

   swap (i1, i2: INTEGER)
-- Swap two characters.
         --
         -- See also `item', `put'.
      require
         valid_index(i1)
         valid_index(i2)
      local
         tmp: INTEGER_16; j1, j2: INTEGER; low_tmp: INTEGER_16; k1, k2: INTEGER
      do
         j1 := i1 - 1
         j2 := i2 - 1
         tmp := storage.item(j1)
         if tmp & 0xF800 = 0xD800 then
            if storage.item(j2) & 0xF800 = 0xD800 then
               -- Both items are surrogates: each position keeps its table
               -- entry and only the low values are exchanged. (Exchanging
               -- the index entries as well, as the original did, cancels
               -- the swap and breaks the ordering of the index table.)
               k1 := low_surrogate_index(i1)
               k2 := low_surrogate_index(i2)
               low_tmp := low_surrogate_values.item(k1)
               low_surrogate_values.put(low_surrogate_values.item(k2), k1)
               low_surrogate_values.put(low_tmp, k2)
            else
               -- Only `i1' is a surrogate: its entry moves to the slot that
               -- keeps the table sorted and is now attached to `i2'.
               -- `k1' must be known before the low value is read.
               k1 := low_surrogate_index(i1)
               low_tmp := low_surrogate_values.item(k1)
               k2 := low_surrogate_position(i2)
               if k2 > k1 then
                  -- Destination is after the vacated slot: the target slot
                  -- is one lower than the insertion position.
                  k2 := k2 - 1
                  if k2 > k1 then
                     low_surrogate_indexes.move(k1 + 1, k2, -1)
                     low_surrogate_values.move(k1 + 1, k2, -1)
                  end
               elseif k1 > k2 then
                  low_surrogate_indexes.move(k2, k1 - 1, 1)
                  low_surrogate_values.move(k2, k1 - 1, 1)
                  --else no move
               end
               low_surrogate_indexes.put(i2, k2)
               low_surrogate_values.put(low_tmp, k2)
            end
         elseif storage.item(j2) & 0xF800 = 0xD800 then
            -- Mirror case: only `i2' is a surrogate, its entry is moved
            -- and attached to `i1'.
            k2 := low_surrogate_index(i2)
            low_tmp := low_surrogate_values.item(k2)
            k1 := low_surrogate_position(i1)
            if k1 > k2 then
               k1 := k1 - 1
               if k1 > k2 then
                  low_surrogate_indexes.move(k2 + 1, k1, -1)
                  low_surrogate_values.move(k2 + 1, k1, -1)
               end
            elseif k2 > k1 then
               low_surrogate_indexes.move(k1, k2 - 1, 1)
               low_surrogate_values.move(k1, k2 - 1, 1)
               --else no move
            end
            low_surrogate_indexes.put(i1, k1)
            low_surrogate_values.put(low_tmp, k1)
            -- else i1 and i2 are not surrogate
         end
         storage.put(storage.item(j2), j1)
         storage.put(tmp, j2)
         next_generation
      ensure
         item(i1) = old item(i2)
         item(i2) = old item(i1)
      end

   insert_character (unicode: INTEGER; i: INTEGER)
         -- Inserts `unicode' at index `i', shifting characters from
         -- position `i' to `count' rightwards.
      require
         valid_insertion_index: 1 <= i and i <= count + 1
         valid_unicode_value: valid_unicode(unicode)
      local
         j, k: INTEGER
      do
         -- Surrogates located at or after `i' move one position right.
         k := low_surrogate_position(i)
         from
            j := low_surrogate_indexes.upper
         until
            j < k
         loop
            low_surrogate_indexes.put(low_surrogate_indexes.item(j) + 1, j)
            j := j - 1
         end
         resize(count + 1)
         if count > 1 then
            storage.move(i - 1, count - 2, 1)
            -- Neutral value, so that `put' does not mistake the stale cell
            -- for a surrogate.
            storage.put(0, i - 1)
         end
         put(unicode, i)
      ensure
         item(i) = unicode
      end

   shrink (min_index, max_index: INTEGER)
         -- Keep only the slice [`min_index' .. `max_index'] or nothing
         -- when the slice is empty.
      require
         1 <= min_index
         max_index <= count
         min_index <= max_index + 1
      local
         i, j: INTEGER
      do
         if max_index < min_index then
            count := 0
            low_surrogate_indexes.make(0)
            low_surrogate_values.make(0)
         elseif min_index = 1 then
            -- Plain truncation: keep the surrogate entries up to and
            -- including `max_index'.
            count := max_index
            i := low_surrogate_position(count)
            if i <= low_surrogate_indexes.upper then
               if low_surrogate_indexes.item(i) = max_index then
                  i := i + 1
               end
            end
            low_surrogate_indexes.resize(i)
            low_surrogate_values.resize(i)
         else
            storage.slice_copy(0, storage, min_index - 1, max_index - 1)
            -- Compact the entries of the kept slice to the front of the
            -- tables, rebasing their indexes on the new first position.
            from
               i := low_surrogate_position(min_index)
            until
               i > low_surrogate_indexes.upper or else low_surrogate_indexes.item(i) > max_index
            loop
               low_surrogate_indexes.put(low_surrogate_indexes.item(i) - min_index + 1, j)
               low_surrogate_values.put(low_surrogate_values.item(i), j)
               j := j + 1
               i := i + 1
            end
            low_surrogate_indexes.resize(j)
            low_surrogate_values.resize(j)
            count := max_index - min_index + 1
         end
         next_generation
      ensure
         count = max_index - min_index + 1
      end

   remove (i: INTEGER)
         -- Remove character at position `i'.
         --
         -- See also `remove_head', `remove_between', `remove_suffix', `remove_prefix'.
      require
         valid_removal_index: valid_index(i)
      do
         remove_between(i, i)
      ensure
         count = old count - 1
      end

   add_first, precede (unicode: INTEGER)
         -- Add `unicode' at first position.
         --
         -- See also `add_last'.
      require
         valid_unicode_value: valid_unicode(unicode)
      local
         i: INTEGER
      do
         -- Every existing surrogate moves one position to the right.
         from
            i := low_surrogate_indexes.upper
         until
            i < 0
         loop
            low_surrogate_indexes.put(low_surrogate_indexes.item(i) + 1, i)
            i := i - 1
         end
         resize(count + 1)
         if count > 1 then
            storage.move(0, count - 2, 1)
            storage.put(0, 0)
         end
         put(unicode, 1)
      ensure
         count = 1 + old count
         item(1) = unicode
      end

   add_last, append_character, extend (unicode: INTEGER)
         -- Append `unicode' to string.
         --
         -- See also `add_first'.
      require
         valid_unicode_value: valid_unicode(unicode)
      local
         new_capacity: INTEGER
      do
         if capacity <= count then
            -- Storage is full: allocate 32 cells the first time, double
            -- the area afterwards.
            if capacity = 0 then
               new_capacity := 32
               storage := storage.calloc(new_capacity)
            else
               new_capacity := 2 * capacity
               storage := storage.realloc(capacity, new_capacity)
            end
            capacity := new_capacity
         end
         if unicode >= 65536 then
            -- Stored as high and low surrogate. The surrogate tables use
            -- 1-based string indexes (see `put', `fill_with', `resize'),
            -- and the new item will live at index `count + 1'; recording
            -- `count' attached the low part to the previous position.
            low_surrogate_indexes.add_last(count + 1)
            low_surrogate_values.add_last((unicode & 0x000003FF).to_integer_16)
            -- The high part carries the 0xD800 tag that `item' tests for.
            storage.put((unicode #// 1024 - 64 + 0x0000D800).low_16, count)
         else
            storage.put(unicode.low_16, count)
         end
         count := count + 1
         next_generation
      ensure
         count = 1 + old count
         item(count) = unicode
      end

   to_lower
         -- Convert all characters to lower case.
         --
         -- See also `to_upper', `as_lower', `as_upper'.
do
         -- Case conversion is not available yet.
         not_yet_implemented
      end

   to_upper
         -- Turn every character into its upper case form.
         -- Not implemented yet.
         --
         -- See also `to_lower', `as_upper', `as_lower'.
      do
         not_yet_implemented
      end

   as_lower: like Current
         -- Fresh copy of `Current' on which `to_lower' has been applied.
         --
         -- See also `as_upper', `to_lower', `to_upper'.
      do
         create Result.copy(Current)
         Result.to_lower
      end

   as_upper: like Current
         -- Fresh copy of `Current' on which `to_upper' has been applied.
         --
         -- See also `as_lower', `to_upper', `to_lower'.
      do
         create Result.copy(Current)
         Result.to_upper
      end

   keep_head (n: INTEGER)
         -- Keep the `n' first characters and drop the others.
         -- Nothing happens when `n' >= `count'.
         --
         -- See also `keep_tail', `remove_head', `remove_tail'.
      require
         n_non_negative: n >= 0
      do
         if count > n then
            remove_tail(count - n)
         end
      ensure
         count = n.min(old count)
      end

   keep_tail (n: INTEGER)
         -- Keep the `n' last characters and drop the others.
         -- Nothing happens when `n' >= `count'.
         --
         -- See also `keep_head', `remove_tail', `remove_head'.
      require
         n_non_negative: n >= 0
      do
         if count > n then
            remove_head(count - n)
         end
      ensure
         count = n.min(old count)
      end

   remove_first
         -- Drop the item at index 1.
         --
         -- See also `remove_head', `remove_last', `remove'.
      require
         not is_empty
      do
         --*** May be improved?
         remove_between(1, 1)
      ensure
         count = old count - 1
      end

   remove_head (n: INTEGER)
         -- Remove `n' first characters. If `n' >= `count', remove all.
         --
         -- See also `remove_tail', `remove', `remove_the_first'.
1250 require 1251 n_non_negative: n >= 0 1252 do 1253 if n > count then 1254 count := 0 1255 low_surrogate_indexes.make(0) 1256 low_surrogate_values.make(0) 1257 else 1258 if n > 0 then 1259 remove_between(1, n) 1260 end 1261 end 1262 next_generation 1263 ensure 1264 count = (old count - n).max(0) 1265 end 1266 1267 remove_last 1268 -- Remove the `last' item. 1269 -- 1270 -- See also `remove_tail', `remove_first', `remove'. 1271 require 1272 not is_empty 1273 do 1274 --*** May be improved 1275 remove_tail(1) 1276 ensure 1277 count = old count - 1 1278 end 1279 1280 remove_tail (n: INTEGER) 1281 -- Remove `n' last characters. If `n' >= `count', remove all. 1282 -- 1283 -- See also `remove_head', `remove', `remove_the_last'. 1284 require 1285 n_non_negative: n >= 0 1286 local 1287 i: INTEGER 1288 do 1289 if n > count then 1290 count := 0 1291 low_surrogate_indexes.make(0) 1292 low_surrogate_values.make(0) 1293 else 1294 count := count - n 1295 i := low_surrogate_position(count + 1) 1296 low_surrogate_indexes.resize(i) 1297 low_surrogate_values.resize(i) 1298 end 1299 next_generation 1300 ensure 1301 count = (old count - n).max(0) 1302 end 1303 1304 remove_substring, remove_between (start_index, end_index: INTEGER) 1305 -- Remove all characters from `strt_index' to `end_index' inclusive. 
1306 require 1307 valid_start_index: 1 <= start_index 1308 valid_end_index: end_index <= count 1309 meaningful_interval: start_index <= end_index + 1 1310 local 1311 i, k, len: INTEGER 1312 do 1313 len := end_index - start_index + 1 1314 if len > 0 then 1315 from 1316 i := low_surrogate_position(start_index) 1317 k := low_surrogate_position(end_index + 1) 1318 until 1319 k > low_surrogate_indexes.upper 1320 loop 1321 low_surrogate_indexes.put(low_surrogate_indexes.item(k) - len, i) 1322 low_surrogate_values.put(low_surrogate_values.item(k), i) 1323 k := k + 1 1324 i := i + 1 1325 end 1326 low_surrogate_indexes.resize(i) 1327 low_surrogate_values.resize(i) 1328 storage.slice_copy(start_index - 1, storage, end_index, count - 1) 1329 count := count - len 1330 end 1331 next_generation 1332 ensure 1333 count = old count - (end_index - start_index + 1) 1334 end 1335 1336 remove_suffix (s: UNICODE_STRING) 1337 -- Remove the suffix `s' of current string. 1338 -- 1339 -- See also `remove_prefix', `remove_tail', `remove'. 1340 require 1341 has_suffix(s) 1342 do 1343 not_yet_implemented 1344 -- remove_last(s.count); equal sequence may have different size 1345 ensure 1346 (old Current.twin).is_equal(Current + old s.twin) 1347 end 1348 1349 remove_prefix (s: UNICODE_STRING) 1350 -- Remove the prefix `s' of current string. 1351 -- 1352 -- See also `remove_suffix', `remove_head', `remove'. 1353 require 1354 has_prefix(s) 1355 do 1356 not_yet_implemented 1357 -- remove_head(s.count); equal sequence may have different size 1358 ensure 1359 (old Current.twin).is_equal(old s.twin + Current) 1360 end 1361 1362 left_adjust 1363 -- Remove leading blanks. 1364 -- 1365 -- See also `remove_head', `first'. 
1366 local 1367 i: INTEGER 1368 do 1369 from 1370 i := 1 1371 until 1372 i > count or else not is_space(item(i -- not_yet_implemented -- handle combining characters 1373 )) 1374 loop 1375 i := i + 1 1376 end 1377 remove_head(i - 1) 1378 ensure 1379 -- not_yet_implemented -- handle combining characters 1380 stripped: is_empty or else not is_space(first) 1381 end 1382 1383 right_adjust 1384 -- Remove trailing blanks. 1385 -- 1386 -- See also `remove_tail', `last'. 1387 local 1388 i: INTEGER 1389 do 1390 from 1391 until 1392 count = 0 or else not is_space(item(count -- not_yet_implemented -- handle combining characters 1393 )) 1394 loop 1395 count := count - 1 1396 end 1397 i := low_surrogate_position(count + 1) 1398 low_surrogate_indexes.resize(i) 1399 low_surrogate_values.resize(i) 1400 next_generation 1401 ensure 1402 -- not_yet_implemented -- handle combining characters 1403 stripped: is_empty or else not is_space(last) 1404 end 1405 1406feature {ANY} -- Printing: 1407 out_in_tagged_out_memory 1408 do 1409 utf8_encode_in(tagged_out_memory) 1410 end 1411 1412 fill_tagged_out_memory 1413 do 1414 tagged_out_memory.append(once "count: ") 1415 count.append_in(tagged_out_memory) 1416 tagged_out_memory.append(once "capacity: ") 1417 capacity.append_in(tagged_out_memory) 1418 tagged_out_memory.append(once "storage: %"") 1419 utf8_encode_in(tagged_out_memory) 1420 tagged_out_memory.append_character('%"') 1421 end 1422 1423feature {ANY} -- Other features: 1424 first: INTEGER 1425 -- Access to the very `first' character. 1426 -- 1427 -- See also `last', `item'. 1428 local 1429 n: INTEGER 1430 do 1431 n := storage.item(0) 1432 if n & 0x0000F800 = 0x0000D800 then 1433 check 1434 low_surrogate_indexes.item(0) = 1 1435 end 1436 Result := n & 0x000007FF + 64 1437 Result := Result * 1024 + low_surrogate_values.item(0) 1438 else 1439 Result := n & 0x0000FFFF 1440 end 1441 end 1442 1443 last: INTEGER 1444 -- Access to the very `last' character. 
1445 -- 1446 -- See also `first', `item'. 1447 local 1448 n: INTEGER 1449 do 1450 n := storage.item(count - 1) 1451 if n & 0x0000F800 = 0x0000D800 then 1452 Result := n & 0x000007FF + 64 1453 Result := Result * 1024 + low_surrogate_value(count) 1454 else 1455 Result := n & 0x0000FFFF 1456 end 1457 end 1458 1459 substring (start_index, end_index: INTEGER): like Current 1460 -- New string consisting of items [`start_index'.. `end_index']. 1461 -- 1462 -- See also `substring_index' and `copy_substring' to save memory. 1463 require 1464 valid_start_index: 1 <= start_index 1465 valid_end_index: end_index <= count 1466 meaningful_interval: start_index <= end_index + 1 1467 local 1468 i: INTEGER; c: like storage; lsi: FAST_ARRAY[INTEGER]; lsv: FAST_ARRAY[INTEGER_16] 1469 do 1470 create Result.make(end_index - start_index + 1) 1471 Result.set_count(end_index - start_index + 1) 1472 c := Result.storage 1473 lsi := Result.low_surrogate_indexes 1474 lsv := Result.low_surrogate_values 1475 c.slice_copy(0, storage, start_index - 1, end_index - 1) 1476 from 1477 i := low_surrogate_position(start_index) 1478 until 1479 i > low_surrogate_indexes.upper or else low_surrogate_indexes.item(i) > end_index 1480 loop 1481 lsi.add_last(low_surrogate_indexes.item(i) - start_index) 1482 lsv.add_last(low_surrogate_values.item(i)) 1483 i := i + 1 1484 end 1485 ensure 1486 substring_count: Result.count = end_index - start_index + 1 1487 end 1488 1489 extend_multiple (unicode: INTEGER; n: INTEGER) 1490 -- Extend Current with `n' times character `unicode'. 1491 require 1492 n >= 0 1493 valid_unicode_value: valid_unicode(unicode) 1494 local 1495 i: INTEGER 1496 do 1497 from 1498 i := n 1499 until 1500 i = 0 1501 loop 1502 append_character(unicode) 1503 i := i - 1 1504 end 1505 ensure 1506 count = n + old count 1507 end 1508 1509 precede_multiple (unicode: INTEGER; n: INTEGER) 1510 -- Prepend `n' times character `unicode' to Current. 
1511 require 1512 n >= 0 1513 valid_unicode_value: valid_unicode(unicode) 1514 local 1515 i: INTEGER 1516 do 1517 if n > 0 then 1518 if count = 0 then 1519 extend_multiple(unicode, n) 1520 else 1521 --|*** May be implemented in a more efficient way... 1522 from 1523 i := n 1524 until 1525 i = 0 1526 loop 1527 precede(unicode) 1528 i := i - 1 1529 end 1530 end 1531 end 1532 ensure 1533 count = n + old count 1534 end 1535 1536 extend_to_count (unicode: INTEGER; needed_count: INTEGER) 1537 -- Extend Current with `unicode' until `needed_count' is reached. 1538 -- Do nothing if `needed_count' is already greater or equal 1539 -- to `count'. 1540 require 1541 needed_count >= 0 1542 valid_unicode_value: valid_unicode(unicode) 1543 do 1544 if needed_count > count then 1545 extend_multiple(unicode, needed_count - count) 1546 end 1547 ensure 1548 count >= needed_count 1549 end 1550 1551 precede_to_count (unicode: INTEGER; needed_count: INTEGER) 1552 -- Prepend `unicode' to Current until `needed_count' is reached. 1553 -- Do nothing if `needed_count' is already greater or equal 1554 -- to `count'. 1555 require 1556 needed_count >= 0 1557 valid_unicode_value: valid_unicode(unicode) 1558 do 1559 if needed_count > count then 1560 precede_multiple(unicode, needed_count - count) 1561 end 1562 ensure 1563 count >= needed_count 1564 end 1565 1566 reverse 1567 -- Reverse the string. 1568 local 1569 i1, i2: INTEGER 1570 do 1571 not_yet_implemented 1572 --|*** reverse grapheme 1573 from 1574 i1 := 1 1575 i2 := count 1576 until 1577 i1 >= i2 1578 loop 1579 swap(i1, i2) 1580 i1 := i1 + 1 1581 i2 := i2 - 1 1582 end 1583 end 1584 1585 remove_all_occurrences (unicode: INTEGER) 1586 -- Remove all occurrences of `unicode'. 1587 -- 1588 -- See also `occurrences', `remove'. 1589 require 1590 valid_unicode_value: valid_unicode(unicode) 1591 local 1592 i: INTEGER 1593 do 1594 --|*** May be implemented in a more efficient way... 
1595 from 1596 i := index_of(unicode, 1) 1597 until 1598 i = 0 1599 loop 1600 remove(i) 1601 i := index_of(unicode, i) 1602 end 1603 ensure 1604 count = old count - old occurrences(unicode) 1605 end 1606 1607 substring_index (other: UNICODE_STRING; start_index: INTEGER): INTEGER 1608 -- Position of first occurrence of `other' at or after `start', 0 if none. 1609 -- 1610 -- See also `substring', `first_substring_index'. 1611 require 1612 other_not_void: other /= Void 1613 valid_start_index: start_index >= 1 and start_index <= count + 1 1614 do 1615 not_yet_implemented 1616 end 1617 1618 first_substring_index (other: UNICODE_STRING): INTEGER 1619 -- Position of first occurrence of `other' at or after 1, 0 if none. 1620 -- 1621 -- See also `substring_index'. 1622 require 1623 other_not_void: other /= Void 1624 do 1625 Result := substring_index(other, 1) 1626 ensure 1627 definition: Result = substring_index(other, 1) 1628 end 1629 1630feature {ANY} -- Splitting a STRING: 1631 split: ARRAY[UNICODE_STRING] 1632 -- Split the string into an array of words. Uses `is_separator' 1633 -- to find words. Gives Void or a non empty array. 1634 -- 1635 -- See also `split_in'. 1636 do 1637 if count > 0 then 1638 split_buffer.clear_count 1639 split_in(split_buffer) 1640 if not split_buffer.is_empty then 1641 Result := split_buffer.twin 1642 end 1643 end 1644 ensure 1645 Result /= Void implies not Result.is_empty 1646 end 1647 1648 split_in (words: COLLECTION[UNICODE_STRING]) 1649 -- Same jobs as `split' but result is appended in `words'. 1650 -- 1651 -- See also `split'. 1652 require 1653 words /= Void 1654 local 1655 state, i: INTEGER; unicode: INTEGER 1656 do 1657 -- state = 0: waiting next word. 1658 -- state = 1: inside a new word. 
1659 -- not_yet_implemented --|*** handle combining characters 1660 if count > 0 then 1661 from 1662 i := 1 1663 until 1664 i > count 1665 loop 1666 unicode := item(i) 1667 if state = 0 then 1668 if not is_separator(unicode) then 1669 string_buffer.clear_count 1670 string_buffer.append_character(unicode) 1671 state := 1 1672 end 1673 else 1674 if not is_separator(unicode) then 1675 string_buffer.append_character(unicode) 1676 else 1677 words.add_last(string_buffer.twin) 1678 state := 0 1679 end 1680 end 1681 i := i + 1 1682 end 1683 if state = 1 then 1684 words.add_last(string_buffer.twin) 1685 end 1686 end 1687 ensure 1688 words.count >= old words.count 1689 end 1690 1691feature {ANY} -- Other features: 1692 extend_unless (unicode: INTEGER) 1693 -- Extend `Current' (using `extend') with `unicode' unless 1694 -- unicode `ch' is already the `last' character. 1695 require 1696 valid_unicode_value: valid_unicode(unicode) 1697 do 1698 if count = 0 or else item(count) /= unicode then 1699 append_character(unicode) 1700 end 1701 ensure 1702 last = unicode 1703 count >= old count 1704 end 1705 1706 new_iterator: ITERATOR[INTEGER] 1707 do 1708 create {ITERATOR_ON_UNICODE_STRING} Result.make(Current) 1709 end 1710 1711 valid_unicode (unicode: INTEGER): BOOLEAN 1712 do 1713 Result := unicode.in_range(0, 0x0010FFFF) and then not unicode.in_range(0x0000D800, 0x0000DFFF) and then unicode /= 0x0000FFFE and then unicode /= 0x0000FFFF 1714 -- surrogates 1715 -- reverse BOM 1716 -- not valid unicode value 1717 end 1718 1719 is_space (unicode: INTEGER): BOOLEAN 1720 do 1721 -- not_yet_implemented; should handle combining characters 1722 Result := unicode = ' '.code 1723 end 1724 1725 is_separator (unicode: INTEGER): BOOLEAN 1726 do 1727 -- not_yet_implemented; should handle combining characters 1728 Result := unicode = ' '.code 1729 end 1730 1731 is_combining (unicode: INTEGER): BOOLEAN 1732 do 1733 not_yet_implemented 1734 -- 0x0300 -> 0x036f 1735 -- 0x20d0 -> 0x20ff 1736 -- 0xfe20 
-> 0xfe2f 1737 end 1738 1739feature {UNICODE_STRING, UNICODE_STRING_HANDLER} 1740 low_surrogate_indexes: FAST_ARRAY[INTEGER] 1741 -- user indexes (starting at 1) 1742 1743 low_surrogate_values: FAST_ARRAY[INTEGER_16] 1744 -- low surrogate value is stored without 0xDC00 part and 1745 -- endianness dependant ! 1746 1747 set_count (new_count: INTEGER) 1748 require 1749 new_count <= capacity 1750 do 1751 count := new_count 1752 end 1753 1754feature {} 1755 string_buffer: UNICODE_STRING 1756 -- Private, temporary once buffer. 1757 once 1758 create Result.make(256) 1759 end 1760 1761 tmp_buffer: STRING 1762 -- Private, temporary once buffer. 1763 once 1764 create Result.make(256) 1765 end 1766 1767 split_buffer: ARRAY[UNICODE_STRING] 1768 once 1769 create Result.with_capacity(4, 1) 1770 end 1771 1772 low_surrogate_value (index: INTEGER): INTEGER_16 1773 require 1774 storage.item(index) & 0xF800 = 0xD800 1775 do 1776 Result := low_surrogate_values.item(low_surrogate_index(index)) 1777 ensure 1778 Result.in_range(0, 1023) 1779 end 1780 1781 low_surrogate_index (index: INTEGER): INTEGER 1782 require 1783 low_surrogate_indexes.has(index) 1784 do 1785 --|*** Should use dichotomic search 1786 Result := low_surrogate_indexes.fast_first_index_of(index) 1787 ensure 1788 low_surrogate_values.valid_index(Result) 1789 end 1790 1791 low_surrogate_position (index: INTEGER): INTEGER 1792 -- return position to use in low_surrogate* arrays relative to 1793 -- character at `index' in the string (return the good answer 1794 -- if the corresponding character is not surrogate) 1795 do 1796 -- Should use dichotomic search 1797 from 1798 until 1799 Result > low_surrogate_indexes.upper or else low_surrogate_indexes.item(Result) >= index 1800 loop 1801 Result := Result + 1 1802 end 1803 ensure 1804 low_surrogate_indexes.is_empty implies Result = 0 1805 Result <= low_surrogate_indexes.upper + 1 1806 Result >= low_surrogate_indexes.lower 1807 Result > low_surrogate_indexes.lower implies 
low_surrogate_indexes.item(Result - 1) < index 1808 Result <= low_surrogate_indexes.upper implies low_surrogate_indexes.item(Result + 1) >= index 1809 end 1810 1811 valid_surrogates: BOOLEAN 1812 local 1813 i, j: INTEGER 1814 do 1815 from 1816 Result := True 1817 until 1818 i >= count 1819 loop 1820 if storage.item(i) & 0xF800 = 0xD800 then 1821 if low_surrogate_indexes.item(j) /= i + 1 then 1822 Result := False 1823 end 1824 if storage.item(i) <= 0xDC00 then 1825 -- negative! 1826 Result := False 1827 end 1828 j := j + 1 1829 end 1830 i := i + 1 1831 end 1832 if low_surrogate_indexes.count /= j then 1833 Result := False 1834 end 1835 end 1836 1837feature {} 1838 manifest_initialize (c: like capacity; s: like storage; ls_cap: INTEGER; lsv: NATIVE_ARRAY[INTEGER_16] 1839 lsi: NATIVE_ARRAY[INTEGER]) 1840 -- This function is a compiler-hook automatically called when 1841 -- a manifest unicode string (i.e. U"foo") is used in the Eiffel 1842 -- source code. 1843 local 1844 i: INTEGER 1845 do 1846 if c > 0 then 1847 storage := storage.calloc(c) 1848 storage.copy_from(s, c - 1) 1849 end 1850 capacity := c 1851 count := c 1852 from 1853 create low_surrogate_indexes.make(ls_cap) 1854 create low_surrogate_values.make(ls_cap) 1855 until 1856 i >= ls_cap 1857 loop 1858 --|*** TODO: array copy may be improved using 1859 --|NATIVE_ARRAY.copy_from. Need to force new upper value 1860 --|in FAST_ARRAY. 
1861 low_surrogate_indexes.add_last(lsi.item(i)) 1862 low_surrogate_values.add_last(lsv.item(i)) 1863 i := i + 1 1864 end 1865 end 1866 1867feature {} 1868 debug_utf8: STRING 1869 1870 set_debug_utf8 1871 do 1872 if debug_utf8 = Void then 1873 debug_utf8 := to_utf8 1874 else 1875 debug_utf8.copy(as_utf8) 1876 end 1877 end 1878 1879 next_generation 1880 do 1881 Precursor 1882 debug("UNICODE_STRING") 1883 set_debug_utf8 1884 end 1885 end 1886 1887feature {RECYCLING_POOL} 1888 recycle 1889 do 1890 clear_count 1891 end 1892 1893invariant 1894 0 <= count 1895 count <= capacity 1896 capacity > 0 implies storage.is_not_null 1897 low_surrogate_values.count = low_surrogate_indexes.count 1898 valid_surrogates 1899 1900end -- class UNICODE_STRING 1901-- 1902-- Copyright (C) 2009-2017: by all the people cited in the AUTHORS file. 1903-- 1904-- Permission is hereby granted, free of charge, to any person obtaining a copy 1905-- of this software and associated documentation files (the "Software"), to deal 1906-- in the Software without restriction, including without limitation the rights 1907-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 1908-- copies of the Software, and to permit persons to whom the Software is 1909-- furnished to do so, subject to the following conditions: 1910-- 1911-- The above copyright notice and this permission notice shall be included in 1912-- all copies or substantial portions of the Software. 1913-- 1914-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1915-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1916-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1917-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1918-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1919-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 1920-- THE SOFTWARE.