/src/wrappers/glib/library/utilities/glib_unicode_manipulation.e
Specman e | 1278 lines | 35 code | 338 blank | 905 comment | 2 complexity | b03c41098c1a40f12d152ad7cc22961f MD5 | raw file
Large files are truncated, but you can click here to view the full file
1indexing 2 description: "C string Utility Functions -- various C-string-related functions." 3 copyright: "[ 4 Copyright (C) 2007 Paolo Redaelli, Anthony Lenton, 5 Soluciones Informaticas Libres S.A., GLib team 6 7 This library is free software; you can redistribute it and/or 8 modify it under the terms of the GNU Lesser General Public License 9 as published by the Free Software Foundation; either version 2.1 of 10 the License, or (at your option) any later version. 11 12 This library is distributed in the hopeOA that it will be useful, but 13 WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 Lesser General Public License for more details. 16 17 You should have received a copy of the GNU Lesser General Public 18 License along with this library; if not, write to the Free Software 19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 20 02110-1301 USA 21 ]" 22 23deferred class GLIB_UNICODE_MANIPULATION 24 25insert 26 ANY undefine is_equal, copy end 27 28feature {} -- Utility functions, inherit them if you need them 29 30-- Unicode Manipulation -- functions operating on Unicode characters and UTF-8 31-- strings. 
32 33-- Synopsis 34 35 36-- #include <glib.h> 37 38 39-- typedef gunichar; 40-- typedef gunichar2; 41 42-- gboolean g_unichar_validate (gunichar ch); 43-- gboolean g_unichar_isalnum (gunichar c); 44-- gboolean g_unichar_isalpha (gunichar c); 45-- gboolean g_unichar_iscntrl (gunichar c); 46-- gboolean g_unichar_isdigit (gunichar c); 47-- gboolean g_unichar_isgraph (gunichar c); 48-- gboolean g_unichar_islower (gunichar c); 49-- gboolean g_unichar_isprint (gunichar c); 50-- gboolean g_unichar_ispunct (gunichar c); 51-- gboolean g_unichar_isspace (gunichar c); 52-- gboolean g_unichar_isupper (gunichar c); 53-- gboolean g_unichar_isxdigit (gunichar c); 54-- gboolean g_unichar_istitle (gunichar c); 55-- gboolean g_unichar_isdefined (gunichar c); 56-- gboolean g_unichar_iswide (gunichar c); 57-- gboolean g_unichar_iswide_cjk (gunichar c); 58-- gunichar g_unichar_toupper (gunichar c); 59-- gunichar g_unichar_tolower (gunichar c); 60-- gunichar g_unichar_totitle (gunichar c); 61-- gint g_unichar_digit_value (gunichar c); 62-- gint g_unichar_xdigit_value (gunichar c); 63-- enum GUnicodeType; 64-- GUnicodeType g_unichar_type (gunichar c); 65-- enum GUnicodeBreakType; 66-- GUnicodeBreakType g_unichar_break_type (gunichar c); 67-- void g_unicode_canonical_ordering (gunichar *string, 68-- gsize len); 69-- gunichar* g_unicode_canonical_decomposition 70-- (gunichar ch, 71-- gsize *result_len); 72-- gboolean g_unichar_get_mirror_char (gunichar ch, 73-- gunichar *mirrored_ch); 74 75-- #define g_utf8_next_char (p) 76-- gunichar g_utf8_get_char (const gchar *p); 77-- gunichar g_utf8_get_char_validated (const gchar *p, 78-- gssize max_len); 79-- gchar* g_utf8_offset_to_pointer (const gchar *str, 80-- glong offset); 81-- glong g_utf8_pointer_to_offset (const gchar *str, 82-- const gchar *pos); 83-- gchar* g_utf8_prev_char (const gchar *p); 84-- gchar* g_utf8_find_next_char (const gchar *p, 85-- const gchar *end); 86-- gchar* g_utf8_find_prev_char (const gchar *str, 87-- const gchar 
*p); 88-- glong g_utf8_strlen (const gchar *p, 89-- gssize max); 90-- gchar* g_utf8_strncpy (gchar *dest, 91-- const gchar *src, 92-- gsize n); 93-- gchar* g_utf8_strchr (const gchar *p, 94-- gssize len, 95-- gunichar c); 96-- gchar* g_utf8_strrchr (const gchar *p, 97-- gssize len, 98-- gunichar c); 99-- gchar* g_utf8_strreverse (const gchar *str, 100-- gssize len); 101-- gboolean g_utf8_validate (const gchar *str, 102-- gssize max_len, 103-- const gchar **end); 104 105-- gchar* g_utf8_strup (const gchar *str, 106-- gssize len); 107-- gchar* g_utf8_strdown (const gchar *str, 108-- gssize len); 109-- gchar* g_utf8_casefold (const gchar *str, 110-- gssize len); 111-- gchar* g_utf8_normalize (const gchar *str, 112-- gssize len, 113-- GNormalizeMode mode); 114-- enum GNormalizeMode; 115-- gint g_utf8_collate (const gchar *str1, 116-- const gchar *str2); 117-- gchar* g_utf8_collate_key (const gchar *str, 118-- gssize len); 119-- gchar* g_utf8_collate_key_for_filename (const gchar *str, 120-- gssize len); 121 122-- gunichar2* g_utf8_to_utf16 (const gchar *str, 123-- glong len, 124-- glong *items_read, 125-- glong *items_written, 126-- GError **error); 127-- gunichar* g_utf8_to_ucs4 (const gchar *str, 128-- glong len, 129-- glong *items_read, 130-- glong *items_written, 131-- GError **error); 132-- gunichar* g_utf8_to_ucs4_fast (const gchar *str, 133-- glong len, 134-- glong *items_written); 135-- gunichar* g_utf16_to_ucs4 (const gunichar2 *str, 136-- glong len, 137-- glong *items_read, 138-- glong *items_written, 139-- GError **error); 140-- gchar* g_utf16_to_utf8 (const gunichar2 *str, 141-- glong len, 142-- glong *items_read, 143-- glong *items_written, 144-- GError **error); 145-- gunichar2* g_ucs4_to_utf16 (const gunichar *str, 146-- glong len, 147-- glong *items_read, 148-- glong *items_written, 149-- GError **error); 150-- gchar* g_ucs4_to_utf8 (const gunichar *str, 151-- glong len, 152-- glong *items_read, 153-- glong *items_written, 154-- GError **error); 155-- 
gint g_unichar_to_utf8 (gunichar c, 156-- gchar *outbuf); 157 158-- Description 159 160-- This section describes a number of functions for dealing with Unicode characters 161-- and strings. There are analogues of the traditional ctype.h character 162-- classification and case conversion functions, UTF-8 analogues of some string 163-- utility functions, functions to perform normalization, case conversion and 164-- collation on UTF-8 strings and finally functions to convert between the UTF-8, 165-- UTF-16 and UCS-4 encodings of Unicode. 166 167-- The implementations of the Unicode functions in GLib are based on the Unicode 168-- Character Data tables, which are available from www.unicode.org. GLib 2.8 169-- supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, GLib 2.12 supports Unicode 170-- 5.0. 171 172-- Details 173 174-- gunichar 175 176-- typedef guint32 gunichar; 177 178-- A type which can hold any UCS-4 character code. 179 180-- --------------------------------------------------------------------------------- 181 182-- gunichar2 183 184-- typedef guint16 gunichar2; 185 186-- A type which can hold any UTF-16 code point^[3]. 187 188-- --------------------------------------------------------------------------------- 189 190-- g_unichar_validate () 191 192-- gboolean g_unichar_validate (gunichar ch); 193 194-- Checks whether ch is a valid Unicode character. Some possible integer values of 195-- ch will not be valid. 0 is considered a valid character, though it's normally a 196-- string terminator. 197 198-- ch : a Unicode character 199-- Returns : TRUE if ch is a valid Unicode character 200 201-- --------------------------------------------------------------------------------- 202 203-- g_unichar_isalnum () 204 205-- gboolean g_unichar_isalnum (gunichar c); 206 207-- Determines whether a character is alphanumeric. Given some UTF-8 text, obtain a 208-- character value with g_utf8_get_char(). 
209 210-- c : a Unicode character 211-- Returns : TRUE if c is an alphanumeric character 212 213-- --------------------------------------------------------------------------------- 214 215-- g_unichar_isalpha () 216 217-- gboolean g_unichar_isalpha (gunichar c); 218 219-- Determines whether a character is alphabetic (i.e. a letter). Given some UTF-8 220-- text, obtain a character value with g_utf8_get_char(). 221 222-- c : a Unicode character 223-- Returns : TRUE if c is an alphabetic character 224 225-- --------------------------------------------------------------------------------- 226 227-- g_unichar_iscntrl () 228 229-- gboolean g_unichar_iscntrl (gunichar c); 230 231-- Determines whether a character is a control character. Given some UTF-8 text, 232-- obtain a character value with g_utf8_get_char(). 233 234-- c : a Unicode character 235-- Returns : TRUE if c is a control character 236 237-- --------------------------------------------------------------------------------- 238 239-- g_unichar_isdigit () 240 241-- gboolean g_unichar_isdigit (gunichar c); 242 243-- Determines whether a character is numeric (i.e. a digit). This covers ASCII 0-9 244-- and also digits in other languages/scripts. Given some UTF-8 text, obtain a 245-- character value with g_utf8_get_char(). 246 247-- c : a Unicode character 248-- Returns : TRUE if c is a digit 249 250-- --------------------------------------------------------------------------------- 251 252-- g_unichar_isgraph () 253 254-- gboolean g_unichar_isgraph (gunichar c); 255 256-- Determines whether a character is printable and not a space (returns FALSE for 257-- control characters, format characters, and spaces). g_unichar_isprint() is 258-- similar, but returns TRUE for spaces. Given some UTF-8 text, obtain a character 259-- value with g_utf8_get_char(). 
260 261-- c : a Unicode character 262-- Returns : TRUE if c is printable unless it's a space 263 264-- --------------------------------------------------------------------------------- 265 266-- g_unichar_islower () 267 268-- gboolean g_unichar_islower (gunichar c); 269 270-- Determines whether a character is a lowercase letter. Given some UTF-8 text, 271-- obtain a character value with g_utf8_get_char(). 272 273-- c : a Unicode character 274-- Returns : TRUE if c is a lowercase letter 275 276-- --------------------------------------------------------------------------------- 277 278-- g_unichar_isprint () 279 280-- gboolean g_unichar_isprint (gunichar c); 281 282-- Determines whether a character is printable. Unlike g_unichar_isgraph(), returns 283-- TRUE for spaces. Given some UTF-8 text, obtain a character value with 284-- g_utf8_get_char(). 285 286-- c : a Unicode character 287-- Returns : TRUE if c is printable 288 289-- --------------------------------------------------------------------------------- 290 291-- g_unichar_ispunct () 292 293-- gboolean g_unichar_ispunct (gunichar c); 294 295-- Determines whether a character is punctuation or a symbol. Given some UTF-8 text, 296-- obtain a character value with g_utf8_get_char(). 297 298-- c : a Unicode character 299-- Returns : TRUE if c is a punctuation or symbol character 300 301-- --------------------------------------------------------------------------------- 302 303-- g_unichar_isspace () 304 305-- gboolean g_unichar_isspace (gunichar c); 306 307-- Determines whether a character is a space, tab, or line separator (newline, 308-- carriage return, etc.). Given some UTF-8 text, obtain a character value with 309-- g_utf8_get_char(). 310 311-- (Note: don't use this to do word breaking; you have to use Pango or equivalent to 312-- get word breaking right, the algorithm is fairly complex.) 
313 314-- c : a Unicode character 315-- Returns : TRUE if c is a space character 316 317-- --------------------------------------------------------------------------------- 318 319-- g_unichar_isupper () 320 321-- gboolean g_unichar_isupper (gunichar c); 322 323-- Determines if a character is uppercase. 324 325-- c : a Unicode character 326-- Returns : TRUE if c is an uppercase character 327 328-- --------------------------------------------------------------------------------- 329 330-- g_unichar_isxdigit () 331 332-- gboolean g_unichar_isxdigit (gunichar c); 333 334-- Determines if a character is a hexadecimal digit. 335 336-- c : a Unicode character. 337-- Returns : TRUE if the character is a hexadecimal digit 338 339-- --------------------------------------------------------------------------------- 340 341-- g_unichar_istitle () 342 343-- gboolean g_unichar_istitle (gunichar c); 344 345-- Determines if a character is titlecase. Some characters in Unicode which are 346-- composites, such as the DZ digraph, have three case variants instead of just two. 347-- The titlecase form is used at the beginning of a word where only the first letter 348-- is capitalized. The titlecase form of the DZ digraph is U+01F2 LATIN CAPITAL 349-- LETTER D WITH SMALL LETTER Z. 350 351-- c : a Unicode character 352-- Returns : TRUE if the character is titlecase 353 354-- --------------------------------------------------------------------------------- 355 356-- g_unichar_isdefined () 357 358-- gboolean g_unichar_isdefined (gunichar c); 359 360-- Determines if a given character is assigned in the Unicode standard. 361 362-- c : a Unicode character 363-- Returns : TRUE if the character has an assigned value 364 365-- --------------------------------------------------------------------------------- 366 367-- g_unichar_iswide () 368 369-- gboolean g_unichar_iswide (gunichar c); 370 371-- Determines if a character is typically rendered in a double-width cell. 
372 373-- c : a Unicode character 374-- Returns : TRUE if the character is wide 375 376-- --------------------------------------------------------------------------------- 377 378-- g_unichar_iswide_cjk () 379 380-- gboolean g_unichar_iswide_cjk (gunichar c); 381 382-- Determines if a character is typically rendered in a double-width cell under 383-- legacy East Asian locales. If a character is wide according to 384-- g_unichar_iswide(), then it is also reported wide with this function, but the 385-- converse is not necessarily true. See the Unicode Standard Annex 11 for details. 386 387-- c : a Unicode character 388-- Returns : TRUE if the character is wide in legacy East Asian locales 389 390-- Since 2.12 391 392-- --------------------------------------------------------------------------------- 393 394-- g_unichar_toupper () 395 396-- gunichar g_unichar_toupper (gunichar c); 397 398-- Converts a character to uppercase. 399 400-- c : a Unicode character 401-- Returns : the result of converting c to uppercase. If c is not a lowercase or 402-- titlecase character, or has no upper case equivalent, c is returned 403-- unchanged. 404 405-- --------------------------------------------------------------------------------- 406 407-- g_unichar_tolower () 408 409-- gunichar g_unichar_tolower (gunichar c); 410 411-- Converts a character to lower case. 412 413-- c : a Unicode character. 414-- Returns : the result of converting c to lower case. If c is not an uppercase or 415-- titlecase character, or has no lowercase equivalent, c is returned 416-- unchanged. 417 418-- --------------------------------------------------------------------------------- 419 420-- g_unichar_totitle () 421 422-- gunichar g_unichar_totitle (gunichar c); 423 424-- Converts a character to titlecase. 425 426-- c : a Unicode character 427-- Returns : the result of converting c to titlecase. If c is not an uppercase or 428-- lowercase character, c is returned unchanged. 
429 430-- --------------------------------------------------------------------------------- 431 432-- g_unichar_digit_value () 433 434-- gint g_unichar_digit_value (gunichar c); 435 436-- Determines the numeric value of a character as a decimal digit. 437 438-- c : a Unicode character 439-- Returns : If c is a decimal digit (according to g_unichar_isdigit()), its numeric 440-- value. Otherwise, -1. 441 442-- --------------------------------------------------------------------------------- 443 444-- g_unichar_xdigit_value () 445 446-- gint g_unichar_xdigit_value (gunichar c); 447 448-- Determines the numeric value of a character as a hexadecimal digit. 449 450-- c : a Unicode character 451-- Returns : If c is a hex digit (according to g_unichar_isxdigit()), its numeric 452-- value. Otherwise, -1. 453 454-- --------------------------------------------------------------------------------- 455 456-- enum GUnicodeType 457 458-- typedef enum 459-- { 460-- G_UNICODE_CONTROL, 461-- G_UNICODE_FORMAT, 462-- G_UNICODE_UNASSIGNED, 463-- G_UNICODE_PRIVATE_USE, 464-- G_UNICODE_SURROGATE, 465-- G_UNICODE_LOWERCASE_LETTER, 466-- G_UNICODE_MODIFIER_LETTER, 467-- G_UNICODE_OTHER_LETTER, 468-- G_UNICODE_TITLECASE_LETTER, 469-- G_UNICODE_UPPERCASE_LETTER, 470-- G_UNICODE_COMBINING_MARK, 471-- G_UNICODE_ENCLOSING_MARK, 472-- G_UNICODE_NON_SPACING_MARK, 473-- G_UNICODE_DECIMAL_NUMBER, 474-- G_UNICODE_LETTER_NUMBER, 475-- G_UNICODE_OTHER_NUMBER, 476-- G_UNICODE_CONNECT_PUNCTUATION, 477-- G_UNICODE_DASH_PUNCTUATION, 478-- G_UNICODE_CLOSE_PUNCTUATION, 479-- G_UNICODE_FINAL_PUNCTUATION, 480-- G_UNICODE_INITIAL_PUNCTUATION, 481-- G_UNICODE_OTHER_PUNCTUATION, 482-- G_UNICODE_OPEN_PUNCTUATION, 483-- G_UNICODE_CURRENCY_SYMBOL, 484-- G_UNICODE_MODIFIER_SYMBOL, 485-- G_UNICODE_MATH_SYMBOL, 486-- G_UNICODE_OTHER_SYMBOL, 487-- G_UNICODE_LINE_SEPARATOR, 488-- G_UNICODE_PARAGRAPH_SEPARATOR, 489-- G_UNICODE_SPACE_SEPARATOR 490-- } GUnicodeType; 491 492-- These are the possible character 
classifications. See 493-- http://www.unicode.org/Public/UNIDATA/UnicodeData.html. 494 495-- --------------------------------------------------------------------------------- 496 497-- g_unichar_type () 498 499-- GUnicodeType g_unichar_type (gunichar c); 500 501-- Classifies a Unicode character by type. 502 503-- c : a Unicode character 504-- Returns : the type of the character. 505 506-- --------------------------------------------------------------------------------- 507 508-- enum GUnicodeBreakType 509 510-- typedef enum 511-- { 512-- G_UNICODE_BREAK_MANDATORY, 513-- G_UNICODE_BREAK_CARRIAGE_RETURN, 514-- G_UNICODE_BREAK_LINE_FEED, 515-- G_UNICODE_BREAK_COMBINING_MARK, 516-- G_UNICODE_BREAK_SURROGATE, 517-- G_UNICODE_BREAK_ZERO_WIDTH_SPACE, 518-- G_UNICODE_BREAK_INSEPARABLE, 519-- G_UNICODE_BREAK_NON_BREAKING_GLUE, 520-- G_UNICODE_BREAK_CONTINGENT, 521-- G_UNICODE_BREAK_SPACE, 522-- G_UNICODE_BREAK_AFTER, 523-- G_UNICODE_BREAK_BEFORE, 524-- G_UNICODE_BREAK_BEFORE_AND_AFTER, 525-- G_UNICODE_BREAK_HYPHEN, 526-- G_UNICODE_BREAK_NON_STARTER, 527-- G_UNICODE_BREAK_OPEN_PUNCTUATION, 528-- G_UNICODE_BREAK_CLOSE_PUNCTUATION, 529-- G_UNICODE_BREAK_QUOTATION, 530-- G_UNICODE_BREAK_EXCLAMATION, 531-- G_UNICODE_BREAK_IDEOGRAPHIC, 532-- G_UNICODE_BREAK_NUMERIC, 533-- G_UNICODE_BREAK_INFIX_SEPARATOR, 534-- G_UNICODE_BREAK_SYMBOL, 535-- G_UNICODE_BREAK_ALPHABETIC, 536-- G_UNICODE_BREAK_PREFIX, 537-- G_UNICODE_BREAK_POSTFIX, 538-- G_UNICODE_BREAK_COMPLEX_CONTEXT, 539-- G_UNICODE_BREAK_AMBIGUOUS, 540-- G_UNICODE_BREAK_UNKNOWN, 541-- G_UNICODE_BREAK_NEXT_LINE, 542-- G_UNICODE_BREAK_WORD_JOINER, 543-- G_UNICODE_BREAK_HANGUL_L_JAMO, 544-- G_UNICODE_BREAK_HANGUL_V_JAMO, 545-- G_UNICODE_BREAK_HANGUL_T_JAMO, 546-- G_UNICODE_BREAK_HANGUL_LV_SYLLABLE, 547-- G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE 548-- } GUnicodeBreakType; 549 550-- These are the possible line break classifications. The five Hangul types were 551-- added in Unicode 4.1, so they were introduced in GLib 2.10. 
Note that new types 552-- may be added in the future. Applications should be ready to handle unknown 553-- values. They may be regarded as G_UNICODE_BREAK_UNKNOWN. See 554-- http://www.unicode.org/unicode/reports/tr14/. 555 556-- --------------------------------------------------------------------------------- 557 558-- g_unichar_break_type () 559 560-- GUnicodeBreakType g_unichar_break_type (gunichar c); 561 562-- Determines the break type of c. c should be a Unicode character (to derive a 563-- character from UTF-8 encoded text, use g_utf8_get_char()). The break type is used 564-- to find word and line breaks ("text boundaries"), Pango implements the Unicode 565-- boundary resolution algorithms and normally you would use a function such as 566-- pango_break() instead of caring about break types yourself. 567 568-- c : a Unicode character 569-- Returns : the break type of c 570 571-- --------------------------------------------------------------------------------- 572 573-- g_unicode_canonical_ordering () 574 575-- void g_unicode_canonical_ordering (gunichar *string, 576-- gsize len); 577 578-- Computes the canonical ordering of a string in-place. This rearranges decomposed 579-- characters in the string according to their combining classes. See the Unicode 580-- manual for more information. 581 582-- string : a UCS-4 encoded string. 583-- len : the maximum length of string to use. 584 585-- --------------------------------------------------------------------------------- 586 587-- g_unicode_canonical_decomposition () 588 589-- gunichar* g_unicode_canonical_decomposition 590-- (gunichar ch, 591-- gsize *result_len); 592 593-- Computes the canonical decomposition of a Unicode character. 594 595-- ch : a Unicode character. 596-- result_len : location to store the length of the return value. 597-- Returns : a newly allocated string of Unicode characters. result_len is set to 598-- the resulting length of the string. 
599 600-- --------------------------------------------------------------------------------- 601 602-- g_unichar_get_mirror_char () 603 604-- gboolean g_unichar_get_mirror_char (gunichar ch, 605-- gunichar *mirrored_ch); 606 607-- In Unicode, some characters are mirrored. This means that their images are 608-- mirrored horizontally in text that is laid out from right to left. For instance, 609-- "(" would become its mirror image, ")", in right-to-left text. 610 611-- If ch has the Unicode mirrored property and there is another unicode character 612-- that typically has a glyph that is the mirror image of ch's glyph and mirrored_ch 613-- is set, it puts that character in the address pointed to by mirrored_ch. 614-- Otherwise the original character is put. 615 616-- ch : a Unicode character 617-- mirrored_ch : location to store the mirrored character 618-- Returns : TRUE if ch has a mirrored character, FALSE otherwise 619 620-- Since 2.4 621 622-- --------------------------------------------------------------------------------- 623 624-- g_utf8_next_char() 625 626-- #define g_utf8_next_char(p) 627 628-- Skips to the next character in a UTF-8 string. The string must be valid; this 629-- macro is as fast as possible, and has no error-checking. You would use this macro 630-- to iterate over a string character by character. The macro returns the start of 631-- the next UTF-8 character. Before using this macro, use g_utf8_validate() to 632-- validate strings that may contain invalid UTF-8. 633 634-- p : Pointer to the start of a valid UTF-8 character. 635 636-- --------------------------------------------------------------------------------- 637 638-- g_utf8_get_char () 639 640-- gunichar g_utf8_get_char (const gchar *p); 641 642-- Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If p does 643-- not point to a valid UTF-8 encoded character, results are undefined. 
If you are 644-- not sure that the bytes are complete valid Unicode characters, you should use 645-- g_utf8_get_char_validated() instead. 646 647-- p : a pointer to a Unicode character encoded as UTF-8 648-- Returns : the resulting character 649 650-- --------------------------------------------------------------------------------- 651 652-- g_utf8_get_char_validated () 653 654-- gunichar g_utf8_get_char_validated (const gchar *p, 655-- gssize max_len); 656 657-- Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This 658-- function checks for incomplete characters, for invalid characters such as 659-- characters that are out of the range of Unicode, and for overlong encodings of 660-- valid characters. 661 662-- p : a pointer to a Unicode character encoded as UTF-8 663-- max_len : the maximum number of bytes to read, or -1, for no maximum. 664-- Returns : the resulting character. If p points to a partial sequence at the end 665-- of a string that could begin a valid character, returns (gunichar)-2; 666-- otherwise, if p does not point to a valid UTF-8 encoded Unicode 667-- character, returns (gunichar)-1. 668 669-- --------------------------------------------------------------------------------- 670 671-- g_utf8_offset_to_pointer () 672 673-- gchar* g_utf8_offset_to_pointer (const gchar *str, 674-- glong offset); 675 676-- Converts from an integer character offset to a pointer to a position within the 677-- string. 678 679-- Since 2.10, this function allows passing a negative offset to step backwards. It 680-- is usually worth stepping backwards from the end instead of forwards if offset is 681-- in the last fourth of the string, since moving forward is about 3 times faster 682-- than moving backward. 
683 684-- str : a UTF-8 encoded string 685-- offset : a character offset within str 686-- Returns : the resulting pointer 687 688-- --------------------------------------------------------------------------------- 689 690-- g_utf8_pointer_to_offset () 691 692-- glong g_utf8_pointer_to_offset (const gchar *str, 693-- const gchar *pos); 694 695-- Converts from a pointer to a position within a string to an integer character 696-- offset. 697 698-- Since 2.10, this function allows pos to be before str, and returns a negative 699-- offset in this case. 700 701-- str : a UTF-8 encoded string 702-- pos : a pointer to a position within str 703-- Returns : the resulting character offset 704 705-- --------------------------------------------------------------------------------- 706 707-- g_utf8_prev_char () 708 709-- gchar* g_utf8_prev_char (const gchar *p); 710 711-- Finds the previous UTF-8 character in the string before p. 712 713-- p does not have to be at the beginning of a UTF-8 character. No check is made to 714-- see if the character found is actually valid other than it starts with an 715-- appropriate byte. If p might be the first character of the string, you must use 716-- g_utf8_find_prev_char() instead. 717 718-- p : a pointer to a position within a UTF-8 encoded string 719-- Returns : a pointer to the found character. 720 721-- --------------------------------------------------------------------------------- 722 723-- g_utf8_find_next_char () 724 725-- gchar* g_utf8_find_next_char (const gchar *p, 726-- const gchar *end); 727 728-- Finds the start of the next UTF-8 character in the string after p. 729 730-- p does not have to be at the beginning of a UTF-8 character. No check is made to 731-- see if the character found is actually valid other than it starts with an 732-- appropriate byte. 
733 734-- p : a pointer to a position within a UTF-8 encoded string 735-- end : a pointer to the end of the string, or NULL to indicate that the string 736-- is nul-terminated 737-- Returns : a pointer to the found character or NULL 738 739-- --------------------------------------------------------------------------------- 740 741-- g_utf8_find_prev_char () 742 743-- gchar* g_utf8_find_prev_char (const gchar *str, 744-- const gchar *p); 745 746-- Given a position p within a UTF-8 encoded string str, find the start of the 747-- previous UTF-8 character starting before p. Returns NULL if no UTF-8 characters 748-- are present in str before p. 749 750-- p does not have to be at the beginning of a UTF-8 character. No check is made to 751-- see if the character found is actually valid other than it starts with an 752-- appropriate byte. 753 754-- str : pointer to the beginning of a UTF-8 encoded string 755-- p : pointer to some position within str 756-- Returns : a pointer to the found character or NULL. 757 758-- --------------------------------------------------------------------------------- 759 760-- g_utf8_strlen () 761 762-- glong g_utf8_strlen (const gchar *p, 763-- gssize max); 764 765-- Returns the length of the string in characters. 766 767-- p : pointer to the start of a UTF-8 encoded string. 768-- max : the maximum number of bytes to examine. If max is less than 0, then the 769-- string is assumed to be nul-terminated. If max is 0, p will not be 770-- examined and may be NULL. 771-- Returns : the length of the string in characters 772 773-- --------------------------------------------------------------------------------- 774 775-- g_utf8_strncpy () 776 777-- gchar* g_utf8_strncpy (gchar *dest, 778-- const gchar *src, 779-- gsize n); 780 781-- Like the standard C strncpy() function, but copies a given number of characters 782-- instead of a given number of bytes. The src string must be valid UTF-8 encoded 783-- text. 
(Use g_utf8_validate() on all text before trying to use UTF-8 utility 784-- functions with it.) 785 786-- dest : buffer to fill with characters from src 787-- src : UTF-8 encoded string 788-- n : character count 789-- Returns : dest 790 791-- --------------------------------------------------------------------------------- 792 793-- g_utf8_strchr () 794 795-- gchar* g_utf8_strchr (const gchar *p, 796-- gssize len, 797-- gunichar c); 798 799-- Finds the leftmost occurrence of the given Unicode character in a UTF-8 encoded 800-- string, while limiting the search to len bytes. If len is -1, allow unbounded 801-- search. 802 803-- p : a nul-terminated UTF-8 encoded string 804-- len : the maximum length of p 805-- c : a Unicode character 806-- Returns : NULL if the string does not contain the character, otherwise, a pointer 807-- to the start of the leftmost occurrence of the character in the string. 808 809-- --------------------------------------------------------------------------------- 810 811-- g_utf8_strrchr () 812 813-- gchar* g_utf8_strrchr (const gchar *p, 814-- gssize len, 815-- gunichar c); 816 817-- Find the rightmost occurrence of the given Unicode character in a UTF-8 encoded 818-- string, while limiting the search to len bytes. If len is -1, allow unbounded 819-- search. 820 821-- p : a nul-terminated UTF-8 encoded string 822-- len : the maximum length of p 823-- c : a Unicode character 824-- Returns : NULL if the string does not contain the character, otherwise, a pointer 825-- to the start of the rightmost occurrence of the character in the 826-- string. 827 828-- --------------------------------------------------------------------------------- 829 830-- g_utf8_strreverse () 831 832-- gchar* g_utf8_strreverse (const gchar *str, 833-- gssize len); 834 835-- Reverses a UTF-8 string. str must be valid UTF-8 encoded text. (Use 836-- g_utf8_validate() on all text before trying to use UTF-8 utility functions with 837-- it.) 
838 839-- Note that unlike g_strreverse(), this function returns newly-allocated memory, 840-- which should be freed with g_free() when no longer needed. 841 842-- str : a UTF-8 encoded string 843-- len : the maximum length of str to use. If len < 0, then the string is 844-- nul-terminated. 845-- Returns : a newly-allocated string which is the reverse of str. 846 847-- Since 2.2 848 849-- --------------------------------------------------------------------------------- 850 851-- g_utf8_validate () 852 853-- gboolean g_utf8_validate (const gchar *str, 854-- gssize max_len, 855-- const gchar **end); 856 857-- Validates UTF-8 encoded text. str is the text to validate; if str is 858-- nul-terminated, then max_len can be -1, otherwise max_len should be the number of 859-- bytes to validate. If end is non-NULL, then the end of the valid range will be 860-- stored there (i.e. the start of the first invalid character if some bytes were 861-- invalid, or the end of the text being validated otherwise). 862 863-- Note that g_utf8_validate() returns FALSE if max_len is positive and NUL is met 864-- before max_len bytes have been read. 865 866-- Returns TRUE if all of str was valid. Many GLib and GTK+ routines require valid 867-- UTF-8 as input; so data read from a file or the network should be checked with 868-- g_utf8_validate() before doing anything else with it. 869 870-- str : a pointer to character data 871-- max_len : max bytes to validate, or -1 to go until NUL 872-- end : return location for end of valid data 873-- Returns : TRUE if the text was valid UTF-8 874 875-- --------------------------------------------------------------------------------- 876 877-- g_utf8_strup () 878 879-- gchar* g_utf8_strup (const gchar *str, 880-- gssize len); 881 882-- Converts all Unicode characters in the string that have a case to uppercase. The 883-- exact manner that this is done depends on the current locale, and may result in 884-- the number of characters in the string increasing. 
(For instance, the German 885-- ess-zet will be changed to SS.) 886 887-- str : a UTF-8 encoded string 888-- len : length of str, in bytes, or -1 if str is nul-terminated. 889-- Returns : a newly allocated string, with all characters converted to uppercase. 890 891-- --------------------------------------------------------------------------------- 892 893-- g_utf8_strdown () 894 895-- gchar* g_utf8_strdown (const gchar *str, 896-- gssize len); 897 898-- Converts all Unicode characters in the string that have a case to lowercase. The 899-- exact manner that this is done depends on the current locale, and may result in 900-- the number of characters in the string changing. 901 902-- str : a UTF-8 encoded string 903-- len : length of str, in bytes, or -1 if str is nul-terminated. 904-- Returns : a newly allocated string, with all characters converted to lowercase. 905 906-- --------------------------------------------------------------------------------- 907 908-- g_utf8_casefold () 909 910-- gchar* g_utf8_casefold (const gchar *str, 911-- gssize len); 912 913-- Converts a string into a form that is independent of case. The result will not 914-- correspond to any particular case, but can be compared for equality or ordered 915-- with the results of calling g_utf8_casefold() on other strings. 916 917-- Note that calling g_utf8_casefold() followed by g_utf8_collate() is only an 918-- approximation to the correct linguistic case insensitive ordering, though it is a 919-- fairly good one. Getting this exactly right would require a more sophisticated 920-- collation function that takes case sensitivity into account. GLib does not 921-- currently provide such a function. 922 923-- str : a UTF-8 encoded string 924-- len : length of str, in bytes, or -1 if str is nul-terminated. 925-- Returns : a newly allocated string, that is a case independent form of str. 
926 927-- --------------------------------------------------------------------------------- 928 929-- g_utf8_normalize () 930 931-- gchar* g_utf8_normalize (const gchar *str, 932-- gssize len, 933-- GNormalizeMode mode); 934 935-- Converts a string into canonical form, standardizing such issues as whether a 936-- character with an accent is represented as a base character and combining accent 937-- or as a single precomposed character. You should generally call 938-- g_utf8_normalize() before comparing two Unicode strings. 939 940-- The normalization mode G_NORMALIZE_DEFAULT only standardizes differences that do 941-- not affect the text content, such as the above-mentioned accent representation. 942-- G_NORMALIZE_ALL also standardizes the "compatibility" characters in Unicode, such 943-- as SUPERSCRIPT THREE to the standard forms (in this case DIGIT THREE). Formatting 944-- information may be lost but for most text operations such characters should be 945-- considered the same. For example, g_utf8_collate() normalizes with 946-- G_NORMALIZE_ALL as its first step. 947 948-- G_NORMALIZE_DEFAULT_COMPOSE and G_NORMALIZE_ALL_COMPOSE are like 949-- G_NORMALIZE_DEFAULT and G_NORMALIZE_ALL, but return a result with composed 950-- forms rather than a maximally decomposed form. This is often useful if you intend 951-- to convert the string to a legacy encoding or pass it to a system with less 952-- capable Unicode handling. 953 954-- str : a UTF-8 encoded string. 955-- len : length of str, in bytes, or -1 if str is nul-terminated. 956-- mode : the type of normalization to perform. 957-- Returns : a newly allocated string, that is the normalized form of str. 
958 959-- --------------------------------------------------------------------------------- 960 961-- enum GNormalizeMode 962 963-- typedef enum { 964-- G_NORMALIZE_DEFAULT, 965-- G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT, 966-- G_NORMALIZE_DEFAULT_COMPOSE, 967-- G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE, 968-- G_NORMALIZE_ALL, 969-- G_NORMALIZE_NFKD = G_NORMALIZE_ALL, 970-- G_NORMALIZE_ALL_COMPOSE, 971-- G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE 972-- } GNormalizeMode; 973 974-- Defines how a Unicode string is transformed into a canonical form, standardizing 975-- such issues as whether a character with an accent is represented as a base 976-- character and combining accent or as a single precomposed character. Unicode 977-- strings should generally be normalized before comparing them. 978 979-- G_NORMALIZE_DEFAULT standardize differences that do not affect the text 980-- content, such as the above-mentioned accent 981-- representation. 982-- G_NORMALIZE_NFD another name for G_NORMALIZE_DEFAULT. 983-- G_NORMALIZE_DEFAULT_COMPOSE like G_NORMALIZE_DEFAULT, but with composed forms 984-- rather than a maximally decomposed form. 985-- G_NORMALIZE_NFC another name for G_NORMALIZE_DEFAULT_COMPOSE. 986-- G_NORMALIZE_ALL beyond G_NORMALIZE_DEFAULT also standardize the 987-- "compatibility" characters in Unicode, such as 988-- SUPERSCRIPT THREE to the standard forms (in this case 989-- DIGIT THREE). Formatting information may be lost but 990-- for most text operations such characters should be 991-- considered the same. 992-- G_NORMALIZE_NFKD another name for G_NORMALIZE_ALL. 993-- G_NORMALIZE_ALL_COMPOSE like G_NORMALIZE_ALL, but with composed forms rather 994-- than a maximally decomposed form. 995-- G_NORMALIZE_NFKC another name for G_NORMALIZE_ALL_COMPOSE. 
996 997-- --------------------------------------------------------------------------------- 998 999-- g_utf8_collate () 1000 1001-- gint g_utf8_collate (const gchar *str1, 1002-- const gchar *str2); 1003 1004-- Compares two strings for ordering using the linguistically correct rules for the 1005-- current locale. When sorting a large number of strings, it will be significantly 1006-- faster to obtain collation keys with g_utf8_collate_key() and compare the keys 1007-- with strcmp() when sorting instead of sorting the original strings. 1008 1009-- str1 : a UTF-8 encoded string 1010-- str2 : a UTF-8 encoded string 1011-- Returns : < 0 if str1 compares before str2, 0 if they compare equal, > 0 if str1 1012-- compares after str2. 1013 1014-- --------------------------------------------------------------------------------- 1015 1016-- g_utf8_collate_key () 1017 1018-- gchar* g_utf8_collate_key (const gchar *str, 1019-- gssize len); 1020 1021-- Converts a string into a collation key that can be compared with other collation 1022-- keys produced by the same function using strcmp(). The results of comparing the 1023-- collation keys of two strings with strcmp() will always be the same as comparing 1024-- the two original strings with g_utf8_collate(). 1025 1026-- str : a UTF-8 encoded string. 1027-- len : length of str, in bytes, or -1 if str is nul-terminated. 1028-- Returns : a newly allocated string. This string should be freed with g_free() 1029-- when you are done with it. 1030 1031-- --------------------------------------------------------------------------------- 1032 1033-- g_utf8_collate_key_for_filename () 1034 1035-- gchar* g_utf8_collate_key_for_filename (const gchar *str, 1036-- gssize len); 1037 1038-- Converts a string into a collation key that can be compared with other collation 1039-- keys produced by the same function using strcmp(). 1040 1041-- In order to sort filenames correctly, this function treats the dot '.' as a 1042-- special case. 
Most dictionary orderings seem to consider it insignificant, thus 1043-- producing the ordering "event.c" "eventgenerator.c" "event.h" instead of 1044-- "event.c" "event.h" "eventgenerator.c". Also, we would like to treat numbers 1045-- intelligently so that "file1" "file10" "file5" is sorted as "file1" "file5" 1046-- "file10". 1047 1048-- str : a UTF-8 encoded string. 1049-- len : length of str, in bytes, or -1 if str is nul-terminated. 1050-- Returns : a newly allocated string. This string should be freed with g_free() 1051-- when you are done with it. 1052 1053-- Since 2.8 1054 1055-- --------------------------------------------------------------------------------- 1056 1057-- g_utf8_to_utf16 () 1058 1059-- gunichar2* g_utf8_to_utf16 (const gchar *str, 1060-- glong len, 1061-- glong *items_read, 1062-- glong *items_written, 1063-- GError **error); 1064 1065-- Convert a string from UTF-8 to UTF-16. A 0 character will be added to the result 1066-- after the converted text. 1067 1068-- str : a UTF-8 encoded string 1069-- len : the maximum length (number of bytes) of str to use. If len < 1070-- 0, then the string is nul-terminated. 1071-- items_read : location to store number of bytes read, or NULL. If NULL, then 1072-- G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str 1073-- contains a trailing partial character. If an error occurs then 1074-- the index of the invalid input is stored here. 1075-- items_written : location to store number of gunichar2 written, or NULL. The value 1076-- stored here does not include the trailing 0. 1077-- error : location to store the error occurring, or NULL to ignore errors. 1078-- Any of the errors in GConvertError other than 1079-- G_CONVERT_ERROR_NO_CONVERSION may occur. 1080-- Returns : a pointer to a newly allocated UTF-16 string. This value must be 1081-- freed with g_free(). If an error occurs, NULL will be returned 1082-- and error set. 
1083 1084-- --------------------------------------------------------------------------------- 1085 1086-- g_utf8_to_ucs4 () 1087 1088-- gunichar* g_utf8_to_ucs4 (const gchar *str, 1089-- glong len, 1090-- glong *items_read, 1091-- glong *items_written, 1092-- GError **error); 1093 1094-- Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4. A 1095-- trailing 0 will be added to the string after the converted text. 1096 1097-- str : a UTF-8 encoded string 1098-- len : the maximum length of str to use. If len < 0, then the string is 1099-- nul-terminated. 1100-- items_read : location to store number of bytes read, or NULL. If NULL, then 1101-- G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str 1102-- contains a trailing partial character. If an error occurs then 1103-- the index of the invalid input is stored here. 1104-- items_written : location to store number of characters written or NULL. The value 1105-- stored here does not include the trailing 0 character. 1106-- error : location to store the error occurring, or NULL to ignore errors. 1107-- Any of the errors in GConvertError other than 1108-- G_CONVERT_ERROR_NO_CONVERSION may occur. 1109-- Returns : a pointer to a newly allocated UCS-4 string. This value must be 1110-- freed with g_free(). If an error occurs, NULL will be returned 1111-- and error set. 1112 1113-- --------------------------------------------------------------------------------- 1114 1115-- g_utf8_to_ucs4_fast () 1116 1117-- gunichar* g_utf8_to_ucs4_fast (const gchar *str, 1118-- glong len, 1119-- glong *items_written); 1120 1121-- Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4, 1122-- assuming valid UTF-8 input. This function is roughly twice as fast as 1123-- g_utf8_to_ucs4() but does no error checking on the input. 1124 1125-- str : a UTF-8 encoded string 1126-- len : the maximum length of str to use. If len < 0, then the string is 1127-- nul-terminated. 
1128-- items_written : location to store the number of characters in the result, or 1129-- NULL. 1130-- Returns : a pointer to a newly allocated UCS-4 string. This value must be 1131-- freed with g_free(). 1132 1133-- --------------------------------------------------------------------------------- 1134 1135-- g_utf16_to_ucs4 () 1136 1137-- gunichar* g_utf16_to_ucs4 (const gunichar2 *str, 1138-- glong len, 1139-- glong *items_read, 1140-- glong *items_written, 1141-- GError **error); 1142 1143-- Convert a string from UTF-16 to UCS-4. The result will be terminated with a 0 1144-- character. 1145 1146-- str : a UTF-16 encoded string 1147-- len : the maximum length (number of gunichar2) of str to use. If len < 1148…
Large files files are truncated, but you can click here to view the full file