/extlibs/SFML/include/SFML/System/Utf.inl
C++ Header | 671 lines | 445 code | 124 blank | 102 comment | 67 complexity | 02d108254fa111f7422014451bae706f MD5 | raw file
////////////////////////////////////////////////////////////
//
// SFML - Simple and Fast Multimedia Library
// Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
//
// This software is provided 'as-is', without any express or implied warranty.
// In no event will the authors be held liable for any damages arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it freely,
// subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented;
//    you must not claim that you wrote the original software.
//    If you use this software in a product, an acknowledgment
//    in the product documentation would be appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such,
//    and must not be misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source distribution.
22// 23//////////////////////////////////////////////////////////// 24 25 26//////////////////////////////////////////////////////////// 27template <typename In> 28In Utf<8>::Decode(In begin, In end, Uint32& output, Uint32 replacement) 29{ 30 // Some useful precomputed data 31 static const int trailing[256] = 32 { 33 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 34 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 40 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 41 }; 42 static const Uint32 offsets[6] = 43 { 44 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 45 }; 46 47 // Decode the character 48 int trailingBytes = trailing[static_cast<Uint8>(*begin)]; 49 if (begin + trailingBytes < end) 50 { 51 output = 0; 52 switch (trailingBytes) 53 { 54 case 5 : output += static_cast<Uint8>(*begin++); output <<= 6; 55 case 4 : output += static_cast<Uint8>(*begin++); output <<= 6; 56 case 3 : output += static_cast<Uint8>(*begin++); output <<= 6; 57 case 2 : output += static_cast<Uint8>(*begin++); output <<= 6; 58 case 1 : output += static_cast<Uint8>(*begin++); output <<= 6; 59 case 0 : output += static_cast<Uint8>(*begin++); 60 } 61 output -= offsets[trailingBytes]; 62 } 63 else 64 { 65 // Incomplete character 66 begin = end; 67 output = replacement; 68 } 69 70 return begin; 71} 72 73 
74//////////////////////////////////////////////////////////// 75template <typename Out> 76Out Utf<8>::Encode(Uint32 input, Out output, Uint8 replacement) 77{ 78 // Some useful precomputed data 79 static const Uint8 firstBytes[7] = 80 { 81 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC 82 }; 83 84 // Encode the character 85 if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF))) 86 { 87 // Invalid character 88 if (replacement) 89 *output++ = replacement; 90 } 91 else 92 { 93 // Valid character 94 95 // Get the number of bytes to write 96 int bytesToWrite = 1; 97 if (input < 0x80) bytesToWrite = 1; 98 else if (input < 0x800) bytesToWrite = 2; 99 else if (input < 0x10000) bytesToWrite = 3; 100 else if (input <= 0x0010FFFF) bytesToWrite = 4; 101 102 // Extract the bytes to write 103 Uint8 bytes[4]; 104 switch (bytesToWrite) 105 { 106 case 4 : bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 107 case 3 : bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 108 case 2 : bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 109 case 1 : bytes[0] = static_cast<Uint8> (input | firstBytes[bytesToWrite]); 110 } 111 112 // Add them to the output 113 const Uint8* currentByte = bytes; 114 switch (bytesToWrite) 115 { 116 case 4 : *output++ = *currentByte++; 117 case 3 : *output++ = *currentByte++; 118 case 2 : *output++ = *currentByte++; 119 case 1 : *output++ = *currentByte++; 120 } 121 } 122 123 return output; 124} 125 126 127//////////////////////////////////////////////////////////// 128template <typename In> 129In Utf<8>::Next(In begin, In end) 130{ 131 Uint32 codepoint; 132 return Decode(begin, end, codepoint); 133} 134 135 136//////////////////////////////////////////////////////////// 137template <typename In> 138std::size_t Utf<8>::Count(In begin, In end) 139{ 140 std::size_t length = 0; 141 while (begin < end) 142 { 143 begin = Next(begin, end); 144 ++length; 145 } 146 147 return length; 148} 149 150 
151//////////////////////////////////////////////////////////// 152template <typename In, typename Out> 153Out Utf<8>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 154{ 155 while (begin < end) 156 { 157 Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); 158 output = Encode(codepoint, output); 159 } 160 161 return output; 162} 163 164 165//////////////////////////////////////////////////////////// 166template <typename In, typename Out> 167Out Utf<8>::FromWide(In begin, In end, Out output) 168{ 169 while (begin < end) 170 { 171 Uint32 codepoint = Utf<32>::DecodeWide(*begin++); 172 output = Encode(codepoint, output); 173 } 174 175 return output; 176} 177 178 179//////////////////////////////////////////////////////////// 180template <typename In, typename Out> 181Out Utf<8>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 182{ 183 while (begin < end) 184 { 185 Uint32 codepoint; 186 begin = Decode(begin, end, codepoint); 187 output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); 188 } 189 190 return output; 191} 192 193 194//////////////////////////////////////////////////////////// 195template <typename In, typename Out> 196Out Utf<8>::ToWide(In begin, In end, Out output, wchar_t replacement) 197{ 198 while (begin < end) 199 { 200 Uint32 codepoint; 201 begin = Decode(begin, end, codepoint); 202 output = Utf<32>::EncodeWide(codepoint, output, replacement); 203 } 204 205 return output; 206} 207 208 209//////////////////////////////////////////////////////////// 210template <typename In, typename Out> 211Out Utf<8>::ToUtf8(In begin, In end, Out output) 212{ 213 while (begin < end) 214 *output++ = *begin++; 215 216 return output; 217} 218 219 220//////////////////////////////////////////////////////////// 221template <typename In, typename Out> 222Out Utf<8>::ToUtf16(In begin, In end, Out output) 223{ 224 while (begin < end) 225 { 226 Uint32 codepoint; 227 begin = Decode(begin, end, codepoint); 
228 output = Utf<16>::Encode(codepoint, output); 229 } 230 231 return output; 232} 233 234 235//////////////////////////////////////////////////////////// 236template <typename In, typename Out> 237Out Utf<8>::ToUtf32(In begin, In end, Out output) 238{ 239 while (begin < end) 240 { 241 Uint32 codepoint; 242 begin = Decode(begin, end, codepoint); 243 *output++ = codepoint; 244 } 245 246 return output; 247} 248 249 250//////////////////////////////////////////////////////////// 251template <typename In> 252In Utf<16>::Decode(In begin, In end, Uint32& output, Uint32 replacement) 253{ 254 Uint16 first = *begin++; 255 256 // If it's a surrogate pair, first convert to a single UTF-32 character 257 if ((first >= 0xD800) && (first <= 0xDBFF)) 258 { 259 if (begin < end) 260 { 261 Uint32 second = *begin++; 262 if ((second >= 0xDC00) && (second <= 0xDFFF)) 263 { 264 // The second element is valid: convert the two elements to a UTF-32 character 265 output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000); 266 } 267 else 268 { 269 // Invalid character 270 output = replacement; 271 } 272 } 273 else 274 { 275 // Invalid character 276 begin = end; 277 output = replacement; 278 } 279 } 280 else 281 { 282 // We can make a direct copy 283 output = first; 284 } 285 286 return begin; 287} 288 289 290//////////////////////////////////////////////////////////// 291template <typename Out> 292Out Utf<16>::Encode(Uint32 input, Out output, Uint16 replacement) 293{ 294 if (input < 0xFFFF) 295 { 296 // The character can be copied directly, we just need to check if it's in the valid range 297 if ((input >= 0xD800) && (input <= 0xDFFF)) 298 { 299 // Invalid character (this range is reserved) 300 if (replacement) 301 *output++ = replacement; 302 } 303 else 304 { 305 // Valid character directly convertible to a single UTF-16 character 306 *output++ = static_cast<Uint16>(input); 307 } 308 } 309 else if (input > 0x0010FFFF) 310 { 311 // Invalid character (greater than 
the maximum unicode value) 312 if (replacement) 313 *output++ = replacement; 314 } 315 else 316 { 317 // The input character will be converted to two UTF-16 elements 318 input -= 0x0010000; 319 *output++ = static_cast<Uint16>((input >> 10) + 0xD800); 320 *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00); 321 } 322 323 return output; 324} 325 326 327//////////////////////////////////////////////////////////// 328template <typename In> 329In Utf<16>::Next(In begin, In end) 330{ 331 Uint32 codepoint; 332 return Decode(begin, end, codepoint); 333} 334 335 336//////////////////////////////////////////////////////////// 337template <typename In> 338std::size_t Utf<16>::Count(In begin, In end) 339{ 340 std::size_t length = 0; 341 while (begin < end) 342 { 343 begin = Next(begin, end); 344 ++length; 345 } 346 347 return length; 348} 349 350 351//////////////////////////////////////////////////////////// 352template <typename In, typename Out> 353Out Utf<16>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 354{ 355 while (begin < end) 356 { 357 Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); 358 output = Encode(codepoint, output); 359 } 360 361 return output; 362} 363 364 365//////////////////////////////////////////////////////////// 366template <typename In, typename Out> 367Out Utf<16>::FromWide(In begin, In end, Out output) 368{ 369 while (begin < end) 370 { 371 Uint32 codepoint = Utf<32>::DecodeWide(*begin++); 372 output = Encode(codepoint, output); 373 } 374 375 return output; 376} 377 378 379//////////////////////////////////////////////////////////// 380template <typename In, typename Out> 381Out Utf<16>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 382{ 383 while (begin < end) 384 { 385 Uint32 codepoint; 386 begin = Decode(begin, end, codepoint); 387 output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); 388 } 389 390 return output; 391} 392 393 
394//////////////////////////////////////////////////////////// 395template <typename In, typename Out> 396Out Utf<16>::ToWide(In begin, In end, Out output, wchar_t replacement) 397{ 398 while (begin < end) 399 { 400 Uint32 codepoint; 401 begin = Decode(begin, end, codepoint); 402 output = Utf<32>::EncodeWide(codepoint, output, replacement); 403 } 404 405 return output; 406} 407 408 409//////////////////////////////////////////////////////////// 410template <typename In, typename Out> 411Out Utf<16>::ToUtf8(In begin, In end, Out output) 412{ 413 while (begin < end) 414 { 415 Uint32 codepoint; 416 begin = Decode(begin, end, codepoint); 417 output = Utf<8>::Encode(codepoint, output); 418 } 419 420 return output; 421} 422 423 424//////////////////////////////////////////////////////////// 425template <typename In, typename Out> 426Out Utf<16>::ToUtf16(In begin, In end, Out output) 427{ 428 while (begin < end) 429 *output++ = *begin++; 430 431 return output; 432} 433 434 435//////////////////////////////////////////////////////////// 436template <typename In, typename Out> 437Out Utf<16>::ToUtf32(In begin, In end, Out output) 438{ 439 while (begin < end) 440 { 441 Uint32 codepoint; 442 begin = Decode(begin, end, codepoint); 443 *output++ = codepoint; 444 } 445 446 return output; 447} 448 449 450//////////////////////////////////////////////////////////// 451template <typename In> 452In Utf<32>::Decode(In begin, In end, Uint32& output, Uint32) 453{ 454 output = *begin++; 455 return begin; 456} 457 458 459//////////////////////////////////////////////////////////// 460template <typename Out> 461Out Utf<32>::Encode(Uint32 input, Out output, Uint32 replacement) 462{ 463 *output++ = input; 464 return output; 465} 466 467 468//////////////////////////////////////////////////////////// 469template <typename In> 470In Utf<32>::Next(In begin, In end) 471{ 472 return ++begin; 473} 474 475 476//////////////////////////////////////////////////////////// 477template <typename In> 
478std::size_t Utf<32>::Count(In begin, In end) 479{ 480 return begin - end; 481} 482 483 484//////////////////////////////////////////////////////////// 485template <typename In, typename Out> 486Out Utf<32>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 487{ 488 while (begin < end) 489 *output++ = DecodeAnsi(*begin++, locale); 490 491 return output; 492} 493 494 495//////////////////////////////////////////////////////////// 496template <typename In, typename Out> 497Out Utf<32>::FromWide(In begin, In end, Out output) 498{ 499 while (begin < end) 500 *output++ = DecodeWide(*begin++); 501 502 return output; 503} 504 505 506//////////////////////////////////////////////////////////// 507template <typename In, typename Out> 508Out Utf<32>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 509{ 510 while (begin < end) 511 output = EncodeAnsi(*begin++, output, replacement, locale); 512 513 return output; 514} 515 516 517//////////////////////////////////////////////////////////// 518template <typename In, typename Out> 519Out Utf<32>::ToWide(In begin, In end, Out output, wchar_t replacement) 520{ 521 while (begin < end) 522 output = EncodeWide(*begin++, output, replacement); 523 524 return output; 525} 526 527 528//////////////////////////////////////////////////////////// 529template <typename In, typename Out> 530Out Utf<32>::ToUtf8(In begin, In end, Out output) 531{ 532 while (begin < end) 533 output = Utf<8>::Encode(*begin++, output); 534 535 return output; 536} 537 538//////////////////////////////////////////////////////////// 539template <typename In, typename Out> 540Out Utf<32>::ToUtf16(In begin, In end, Out output) 541{ 542 while (begin < end) 543 output = Utf<16>::Encode(*begin++, output); 544 545 return output; 546} 547 548 549//////////////////////////////////////////////////////////// 550template <typename In, typename Out> 551Out Utf<32>::ToUtf32(In begin, In end, Out output) 552{ 553 while (begin < 
end) 554 *output++ = *begin++; 555 556 return output; 557} 558 559 560//////////////////////////////////////////////////////////// 561template <typename In> 562Uint32 Utf<32>::DecodeAnsi(In input, const std::locale& locale) 563{ 564 // On Windows, gcc's standard library (glibc++) has almost 565 // no support for Unicode stuff. As a consequence, in this 566 // context we can only use the default locale and ignore 567 // the one passed as parameter. 568 569 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ 570 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ 571 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ 572 573 wchar_t character = 0; 574 mbtowc(&character, &input, 1); 575 return static_cast<Uint32>(character); 576 577 #else 578 579 // Get the facet of the locale which deals with character conversion 580 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 581 582 // Use the facet to convert each character of the input string 583 return static_cast<Uint32>(facet.widen(input)); 584 585 #endif 586} 587 588 589//////////////////////////////////////////////////////////// 590template <typename In> 591Uint32 Utf<32>::DecodeWide(In input) 592{ 593 // The encoding of wide characters is not well defined and is left to the system; 594 // however we can safely assume that it is UCS-2 on Windows and 595 // UCS-4 on Unix systems. 596 // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4, 597 // and UCS-4 *is* UTF-32). 598 599 return input; 600} 601 602 603//////////////////////////////////////////////////////////// 604template <typename Out> 605Out Utf<32>::EncodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale) 606{ 607 // On Windows, gcc's standard library (glibc++) has almost 608 // no support for Unicode stuff. 
As a consequence, in this 609 // context we can only use the default locale and ignore 610 // the one passed as parameter. 611 612 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ 613 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ 614 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ 615 616 char character = 0; 617 if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0) 618 *output++ = character; 619 else if (replacement) 620 *output++ = replacement; 621 622 return output; 623 624 #else 625 626 // Get the facet of the locale which deals with character conversion 627 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 628 629 // Use the facet to convert each character of the input string 630 *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement); 631 632 return output; 633 634 #endif 635} 636 637 638//////////////////////////////////////////////////////////// 639template <typename Out> 640Out Utf<32>::EncodeWide(Uint32 codepoint, Out output, wchar_t replacement) 641{ 642 // The encoding of wide characters is not well defined and is left to the system; 643 // however we can safely assume that it is UCS-2 on Windows and 644 // UCS-4 on Unix systems. 645 // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4). 646 // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32). 647 648 switch (sizeof(wchar_t)) 649 { 650 case 4: 651 { 652 *output++ = static_cast<wchar_t>(codepoint); 653 break; 654 } 655 656 default: 657 { 658 if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF))) 659 { 660 *output++ = static_cast<wchar_t>(codepoint); 661 } 662 else if (replacement) 663 { 664 *output++ = replacement; 665 } 666 break; 667 } 668 } 669 670 return output; 671}