PageRenderTime 34ms CodeModel.GetById 17ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 1ms

/src/wrappers/glib/library/utilities/glib_unicode_manipulation.e

http://github.com/tybor/Liberty
Specman e | 1278 lines | 35 code | 338 blank | 905 comment | 2 complexity | b03c41098c1a40f12d152ad7cc22961f MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1indexing
   2	description: "Unicode Manipulation -- functions operating on Unicode characters and UTF-8 strings."
   3	copyright: "[
   4					Copyright (C) 2007 Paolo Redaelli, Anthony Lenton,
   5					                   Soluciones Informaticas Libres S.A., GLib team
   6					
   7					This library is free software; you can redistribute it and/or
   8					modify it under the terms of the GNU Lesser General Public License
   9					as published by the Free Software Foundation; either version 2.1 of
  10					the License, or (at your option) any later version.
  11					
  12					This library is distributed in the hope that it will be useful, but
  13					WITHOUT ANY WARRANTY; without even the implied warranty of
  14					MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15					Lesser General Public License for more details.
  16
  17					You should have received a copy of the GNU Lesser General Public
  18					License along with this library; if not, write to the Free Software
  19					Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20					02110-1301 USA
  21			]"
  22
  23deferred class GLIB_UNICODE_MANIPULATION
  24
  25insert
  26	ANY undefine is_equal, copy end
  27
  28feature {} -- Utility functions, inherit them if you need them
  29
  30--    Unicode Manipulation -- functions operating on Unicode characters and UTF-8
  31--    strings.
  32
  33-- Synopsis
  34
  35
  36--  #include <glib.h>
  37
  38
  39--  typedef     gunichar;
  40--  typedef     gunichar2;
  41
  42--  gboolean    g_unichar_validate              (gunichar ch);
  43--  gboolean    g_unichar_isalnum               (gunichar c);
  44--  gboolean    g_unichar_isalpha               (gunichar c);
  45--  gboolean    g_unichar_iscntrl               (gunichar c);
  46--  gboolean    g_unichar_isdigit               (gunichar c);
  47--  gboolean    g_unichar_isgraph               (gunichar c);
  48--  gboolean    g_unichar_islower               (gunichar c);
  49--  gboolean    g_unichar_isprint               (gunichar c);
  50--  gboolean    g_unichar_ispunct               (gunichar c);
  51--  gboolean    g_unichar_isspace               (gunichar c);
  52--  gboolean    g_unichar_isupper               (gunichar c);
  53--  gboolean    g_unichar_isxdigit              (gunichar c);
  54--  gboolean    g_unichar_istitle               (gunichar c);
  55--  gboolean    g_unichar_isdefined             (gunichar c);
  56--  gboolean    g_unichar_iswide                (gunichar c);
  57--  gboolean    g_unichar_iswide_cjk            (gunichar c);
  58--  gunichar    g_unichar_toupper               (gunichar c);
  59--  gunichar    g_unichar_tolower               (gunichar c);
  60--  gunichar    g_unichar_totitle               (gunichar c);
  61--  gint        g_unichar_digit_value           (gunichar c);
  62--  gint        g_unichar_xdigit_value          (gunichar c);
  63--  enum        GUnicodeType;
  64--  GUnicodeType g_unichar_type                 (gunichar c);
  65--  enum        GUnicodeBreakType;
  66--  GUnicodeBreakType g_unichar_break_type      (gunichar c);
  67--  void        g_unicode_canonical_ordering    (gunichar *string,
  68--                                               gsize len);
  69--  gunichar*   g_unicode_canonical_decomposition
  70--                                              (gunichar ch,
  71--                                               gsize *result_len);
  72--  gboolean    g_unichar_get_mirror_char       (gunichar ch,
  73--                                               gunichar *mirrored_ch);
  74
  75--  #define     g_utf8_next_char                (p)
  76--  gunichar    g_utf8_get_char                 (const gchar *p);
  77--  gunichar    g_utf8_get_char_validated       (const gchar *p,
  78--                                               gssize max_len);
  79--  gchar*      g_utf8_offset_to_pointer        (const gchar *str,
  80--                                               glong offset);
  81--  glong       g_utf8_pointer_to_offset        (const gchar *str,
  82--                                               const gchar *pos);
  83--  gchar*      g_utf8_prev_char                (const gchar *p);
  84--  gchar*      g_utf8_find_next_char           (const gchar *p,
  85--                                               const gchar *end);
  86--  gchar*      g_utf8_find_prev_char           (const gchar *str,
  87--                                               const gchar *p);
  88--  glong       g_utf8_strlen                   (const gchar *p,
  89--                                               gssize max);
  90--  gchar*      g_utf8_strncpy                  (gchar *dest,
  91--                                               const gchar *src,
  92--                                               gsize n);
  93--  gchar*      g_utf8_strchr                   (const gchar *p,
  94--                                               gssize len,
  95--                                               gunichar c);
  96--  gchar*      g_utf8_strrchr                  (const gchar *p,
  97--                                               gssize len,
  98--                                               gunichar c);
  99--  gchar*      g_utf8_strreverse               (const gchar *str,
 100--                                               gssize len);
 101--  gboolean    g_utf8_validate                 (const gchar *str,
 102--                                               gssize max_len,
 103--                                               const gchar **end);
 104
 105--  gchar*      g_utf8_strup                    (const gchar *str,
 106--                                               gssize len);
 107--  gchar*      g_utf8_strdown                  (const gchar *str,
 108--                                               gssize len);
 109--  gchar*      g_utf8_casefold                 (const gchar *str,
 110--                                               gssize len);
 111--  gchar*      g_utf8_normalize                (const gchar *str,
 112--                                               gssize len,
 113--                                               GNormalizeMode mode);
 114--  enum        GNormalizeMode;
 115--  gint        g_utf8_collate                  (const gchar *str1,
 116--                                               const gchar *str2);
 117--  gchar*      g_utf8_collate_key              (const gchar *str,
 118--                                               gssize len);
 119--  gchar*      g_utf8_collate_key_for_filename (const gchar *str,
 120--                                               gssize len);
 121
 122--  gunichar2*  g_utf8_to_utf16                 (const gchar *str,
 123--                                               glong len,
 124--                                               glong *items_read,
 125--                                               glong *items_written,
 126--                                               GError **error);
 127--  gunichar*   g_utf8_to_ucs4                  (const gchar *str,
 128--                                               glong len,
 129--                                               glong *items_read,
 130--                                               glong *items_written,
 131--                                               GError **error);
 132--  gunichar*   g_utf8_to_ucs4_fast             (const gchar *str,
 133--                                               glong len,
 134--                                               glong *items_written);
 135--  gunichar*   g_utf16_to_ucs4                 (const gunichar2 *str,
 136--                                               glong len,
 137--                                               glong *items_read,
 138--                                               glong *items_written,
 139--                                               GError **error);
 140--  gchar*      g_utf16_to_utf8                 (const gunichar2 *str,
 141--                                               glong len,
 142--                                               glong *items_read,
 143--                                               glong *items_written,
 144--                                               GError **error);
 145--  gunichar2*  g_ucs4_to_utf16                 (const gunichar *str,
 146--                                               glong len,
 147--                                               glong *items_read,
 148--                                               glong *items_written,
 149--                                               GError **error);
 150--  gchar*      g_ucs4_to_utf8                  (const gunichar *str,
 151--                                               glong len,
 152--                                               glong *items_read,
 153--                                               glong *items_written,
 154--                                               GError **error);
 155--  gint        g_unichar_to_utf8               (gunichar c,
 156--                                               gchar *outbuf);
 157
 158-- Description
 159
 160--    This section describes a number of functions for dealing with Unicode characters
 161--    and strings. There are analogues of the traditional ctype.h character
 162--    classification and case conversion functions, UTF-8 analogues of some string
 163--    utility functions, functions to perform normalization, case conversion and
 164--    collation on UTF-8 strings and finally functions to convert between the UTF-8,
 165--    UTF-16 and UCS-4 encodings of Unicode.
 166
 167--    The implementations of the Unicode functions in GLib are based on the Unicode
 168--    Character Data tables, which are available from www.unicode.org. GLib 2.8
 169--    supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, GLib 2.12 supports Unicode
 170--    5.0.
 171
 172-- Details
 173
 174--   gunichar
 175
 176--  typedef guint32 gunichar;
 177
 178--    A type which can hold any UCS-4 character code.
 179
 180--    ---------------------------------------------------------------------------------
 181
 182--   gunichar2
 183
 184--  typedef guint16 gunichar2;
 185
 186--    A type which can hold any UTF-16 code point^[3].
 187
 188--    ---------------------------------------------------------------------------------
 189
 190--   g_unichar_validate ()
 191
 192--  gboolean    g_unichar_validate              (gunichar ch);
 193
 194--    Checks whether ch is a valid Unicode character. Some possible integer values of
 195--    ch will not be valid. 0 is considered a valid character, though it's normally a
 196--    string terminator.
 197
 198--    ch :      a Unicode character
 199--    Returns : TRUE if ch is a valid Unicode character
 200
 201--    ---------------------------------------------------------------------------------
 202
 203--   g_unichar_isalnum ()
 204
 205--  gboolean    g_unichar_isalnum               (gunichar c);
 206
 207--    Determines whether a character is alphanumeric. Given some UTF-8 text, obtain a
 208--    character value with g_utf8_get_char().
 209
 210--    c :       a Unicode character
 211--    Returns : TRUE if c is an alphanumeric character
 212
 213--    ---------------------------------------------------------------------------------
 214
 215--   g_unichar_isalpha ()
 216
 217--  gboolean    g_unichar_isalpha               (gunichar c);
 218
 219--    Determines whether a character is alphabetic (i.e. a letter). Given some UTF-8
 220--    text, obtain a character value with g_utf8_get_char().
 221
 222--    c :       a Unicode character
 223--    Returns : TRUE if c is an alphabetic character
 224
 225--    ---------------------------------------------------------------------------------
 226
 227--   g_unichar_iscntrl ()
 228
 229--  gboolean    g_unichar_iscntrl               (gunichar c);
 230
 231--    Determines whether a character is a control character. Given some UTF-8 text,
 232--    obtain a character value with g_utf8_get_char().
 233
 234--    c :       a Unicode character
 235--    Returns : TRUE if c is a control character
 236
 237--    ---------------------------------------------------------------------------------
 238
 239--   g_unichar_isdigit ()
 240
 241--  gboolean    g_unichar_isdigit               (gunichar c);
 242
 243--    Determines whether a character is numeric (i.e. a digit). This covers ASCII 0-9
 244--    and also digits in other languages/scripts. Given some UTF-8 text, obtain a
 245--    character value with g_utf8_get_char().
 246
 247--    c :       a Unicode character
 248--    Returns : TRUE if c is a digit
 249
 250--    ---------------------------------------------------------------------------------
 251
 252--   g_unichar_isgraph ()
 253
 254--  gboolean    g_unichar_isgraph               (gunichar c);
 255
 256--    Determines whether a character is printable and not a space (returns FALSE for
 257--    control characters, format characters, and spaces). g_unichar_isprint() is
 258--    similar, but returns TRUE for spaces. Given some UTF-8 text, obtain a character
 259--    value with g_utf8_get_char().
 260
 261--    c :       a Unicode character
 262--    Returns : TRUE if c is printable unless it's a space
 263
 264--    ---------------------------------------------------------------------------------
 265
 266--   g_unichar_islower ()
 267
 268--  gboolean    g_unichar_islower               (gunichar c);
 269
 270--    Determines whether a character is a lowercase letter. Given some UTF-8 text,
 271--    obtain a character value with g_utf8_get_char().
 272
 273--    c :       a Unicode character
 274--    Returns : TRUE if c is a lowercase letter
 275
 276--    ---------------------------------------------------------------------------------
 277
 278--   g_unichar_isprint ()
 279
 280--  gboolean    g_unichar_isprint               (gunichar c);
 281
 282--    Determines whether a character is printable. Unlike g_unichar_isgraph(), returns
 283--    TRUE for spaces. Given some UTF-8 text, obtain a character value with
 284--    g_utf8_get_char().
 285
 286--    c :       a Unicode character
 287--    Returns : TRUE if c is printable
 288
 289--    ---------------------------------------------------------------------------------
 290
 291--   g_unichar_ispunct ()
 292
 293--  gboolean    g_unichar_ispunct               (gunichar c);
 294
 295--    Determines whether a character is punctuation or a symbol. Given some UTF-8 text,
 296--    obtain a character value with g_utf8_get_char().
 297
 298--    c :       a Unicode character
 299--    Returns : TRUE if c is a punctuation or symbol character
 300
 301--    ---------------------------------------------------------------------------------
 302
 303--   g_unichar_isspace ()
 304
 305--  gboolean    g_unichar_isspace               (gunichar c);
 306
 307--    Determines whether a character is a space, tab, or line separator (newline,
 308--    carriage return, etc.). Given some UTF-8 text, obtain a character value with
 309--    g_utf8_get_char().
 310
 311--    (Note: don't use this to do word breaking; you have to use Pango or equivalent to
 312--    get word breaking right, the algorithm is fairly complex.)
 313
 314--    c :       a Unicode character
 315--    Returns : TRUE if c is a space character
 316
 317--    ---------------------------------------------------------------------------------
 318
 319--   g_unichar_isupper ()
 320
 321--  gboolean    g_unichar_isupper               (gunichar c);
 322
 323--    Determines if a character is uppercase.
 324
 325--    c :       a Unicode character
 326--    Returns : TRUE if c is an uppercase character
 327
 328--    ---------------------------------------------------------------------------------
 329
 330--   g_unichar_isxdigit ()
 331
 332--  gboolean    g_unichar_isxdigit              (gunichar c);
 333
 334--    Determines if a character is a hexadecimal digit.
 335
 336--    c :       a Unicode character.
 337--    Returns : TRUE if the character is a hexadecimal digit
 338
 339--    ---------------------------------------------------------------------------------
 340
 341--   g_unichar_istitle ()
 342
 343--  gboolean    g_unichar_istitle               (gunichar c);
 344
 345--    Determines if a character is titlecase. Some characters in Unicode which are
 346--    composites, such as the DZ digraph have three case variants instead of just two.
 347--    The titlecase form is used at the beginning of a word where only the first letter
 348--    is capitalized. The titlecase form of the DZ digraph is U+01F2 LATIN CAPITAL
 349--    LETTER D WITH SMALL LETTER Z.
 350
 351--    c :       a Unicode character
 352--    Returns : TRUE if the character is titlecase
 353
 354--    ---------------------------------------------------------------------------------
 355
 356--   g_unichar_isdefined ()
 357
 358--  gboolean    g_unichar_isdefined             (gunichar c);
 359
 360--    Determines if a given character is assigned in the Unicode standard.
 361
 362--    c :       a Unicode character
 363--    Returns : TRUE if the character has an assigned value
 364
 365--    ---------------------------------------------------------------------------------
 366
 367--   g_unichar_iswide ()
 368
 369--  gboolean    g_unichar_iswide                (gunichar c);
 370
 371--    Determines if a character is typically rendered in a double-width cell.
 372
 373--    c :       a Unicode character
 374--    Returns : TRUE if the character is wide
 375
 376--    ---------------------------------------------------------------------------------
 377
 378--   g_unichar_iswide_cjk ()
 379
 380--  gboolean    g_unichar_iswide_cjk            (gunichar c);
 381
 382--    Determines if a character is typically rendered in a double-width cell under
 383--    legacy East Asian locales. If a character is wide according to
 384--    g_unichar_iswide(), then it is also reported wide with this function, but the
 385--    converse is not necessarily true. See the Unicode Standard Annex 11 for details.
 386
 387--    c :       a Unicode character
 388--    Returns : TRUE if the character is wide in legacy East Asian locales
 389
 390--    Since 2.12
 391
 392--    ---------------------------------------------------------------------------------
 393
 394--   g_unichar_toupper ()
 395
 396--  gunichar    g_unichar_toupper               (gunichar c);
 397
 398--    Converts a character to uppercase.
 399
 400--    c :       a Unicode character
 401--    Returns : the result of converting c to uppercase. If c is not a lowercase or
 402--              titlecase character, or has no uppercase equivalent, c is returned
 403--              unchanged.
 404
 405--    ---------------------------------------------------------------------------------
 406
 407--   g_unichar_tolower ()
 408
 409--  gunichar    g_unichar_tolower               (gunichar c);
 410
 411--    Converts a character to lower case.
 412
 413--    c :       a Unicode character.
 414--    Returns : the result of converting c to lower case. If c is not an uppercase or
 415--              titlecase character, or has no lowercase equivalent, c is returned
 416--              unchanged.
 417
 418--    ---------------------------------------------------------------------------------
 419
 420--   g_unichar_totitle ()
 421
 422--  gunichar    g_unichar_totitle               (gunichar c);
 423
 424--    Converts a character to the titlecase.
 425
 426--    c :       a Unicode character
 427--    Returns : the result of converting c to titlecase. If c is not an uppercase or
 428--              lowercase character, c is returned unchanged.
 429
 430--    ---------------------------------------------------------------------------------
 431
 432--   g_unichar_digit_value ()
 433
 434--  gint        g_unichar_digit_value           (gunichar c);
 435
 436--    Determines the numeric value of a character as a decimal digit.
 437
 438--    c :       a Unicode character
 439--    Returns : If c is a decimal digit (according to g_unichar_isdigit()), its numeric
 440--              value. Otherwise, -1.
 441
 442--    ---------------------------------------------------------------------------------
 443
 444--   g_unichar_xdigit_value ()
 445
 446--  gint        g_unichar_xdigit_value          (gunichar c);
 447
 448--    Determines the numeric value of a character as a hexadecimal digit.
 449
 450--    c :       a Unicode character
 451--    Returns : If c is a hex digit (according to g_unichar_isxdigit()), its numeric
 452--              value. Otherwise, -1.
 453
 454--    ---------------------------------------------------------------------------------
 455
 456--   enum GUnicodeType
 457
 458--  typedef enum
 459--  {
 460--    G_UNICODE_CONTROL,
 461--    G_UNICODE_FORMAT,
 462--    G_UNICODE_UNASSIGNED,
 463--    G_UNICODE_PRIVATE_USE,
 464--    G_UNICODE_SURROGATE,
 465--    G_UNICODE_LOWERCASE_LETTER,
 466--    G_UNICODE_MODIFIER_LETTER,
 467--    G_UNICODE_OTHER_LETTER,
 468--    G_UNICODE_TITLECASE_LETTER,
 469--    G_UNICODE_UPPERCASE_LETTER,
 470--    G_UNICODE_COMBINING_MARK,
 471--    G_UNICODE_ENCLOSING_MARK,
 472--    G_UNICODE_NON_SPACING_MARK,
 473--    G_UNICODE_DECIMAL_NUMBER,
 474--    G_UNICODE_LETTER_NUMBER,
 475--    G_UNICODE_OTHER_NUMBER,
 476--    G_UNICODE_CONNECT_PUNCTUATION,
 477--    G_UNICODE_DASH_PUNCTUATION,
 478--    G_UNICODE_CLOSE_PUNCTUATION,
 479--    G_UNICODE_FINAL_PUNCTUATION,
 480--    G_UNICODE_INITIAL_PUNCTUATION,
 481--    G_UNICODE_OTHER_PUNCTUATION,
 482--    G_UNICODE_OPEN_PUNCTUATION,
 483--    G_UNICODE_CURRENCY_SYMBOL,
 484--    G_UNICODE_MODIFIER_SYMBOL,
 485--    G_UNICODE_MATH_SYMBOL,
 486--    G_UNICODE_OTHER_SYMBOL,
 487--    G_UNICODE_LINE_SEPARATOR,
 488--    G_UNICODE_PARAGRAPH_SEPARATOR,
 489--    G_UNICODE_SPACE_SEPARATOR
 490--  } GUnicodeType;
 491
 492--    These are the possible character classifications. See
 493--    http://www.unicode.org/Public/UNIDATA/UnicodeData.html.
 494
 495--    ---------------------------------------------------------------------------------
 496
 497--   g_unichar_type ()
 498
 499--  GUnicodeType g_unichar_type                 (gunichar c);
 500
 501--    Classifies a Unicode character by type.
 502
 503--    c :       a Unicode character
 504--    Returns : the type of the character.
 505
 506--    ---------------------------------------------------------------------------------
 507
 508--   enum GUnicodeBreakType
 509
 510--  typedef enum
 511--  {
 512--    G_UNICODE_BREAK_MANDATORY,
 513--    G_UNICODE_BREAK_CARRIAGE_RETURN,
 514--    G_UNICODE_BREAK_LINE_FEED,
 515--    G_UNICODE_BREAK_COMBINING_MARK,
 516--    G_UNICODE_BREAK_SURROGATE,
 517--    G_UNICODE_BREAK_ZERO_WIDTH_SPACE,
 518--    G_UNICODE_BREAK_INSEPARABLE,
 519--    G_UNICODE_BREAK_NON_BREAKING_GLUE,
 520--    G_UNICODE_BREAK_CONTINGENT,
 521--    G_UNICODE_BREAK_SPACE,
 522--    G_UNICODE_BREAK_AFTER,
 523--    G_UNICODE_BREAK_BEFORE,
 524--    G_UNICODE_BREAK_BEFORE_AND_AFTER,
 525--    G_UNICODE_BREAK_HYPHEN,
 526--    G_UNICODE_BREAK_NON_STARTER,
 527--    G_UNICODE_BREAK_OPEN_PUNCTUATION,
 528--    G_UNICODE_BREAK_CLOSE_PUNCTUATION,
 529--    G_UNICODE_BREAK_QUOTATION,
 530--    G_UNICODE_BREAK_EXCLAMATION,
 531--    G_UNICODE_BREAK_IDEOGRAPHIC,
 532--    G_UNICODE_BREAK_NUMERIC,
 533--    G_UNICODE_BREAK_INFIX_SEPARATOR,
 534--    G_UNICODE_BREAK_SYMBOL,
 535--    G_UNICODE_BREAK_ALPHABETIC,
 536--    G_UNICODE_BREAK_PREFIX,
 537--    G_UNICODE_BREAK_POSTFIX,
 538--    G_UNICODE_BREAK_COMPLEX_CONTEXT,
 539--    G_UNICODE_BREAK_AMBIGUOUS,
 540--    G_UNICODE_BREAK_UNKNOWN,
 541--    G_UNICODE_BREAK_NEXT_LINE,
 542--    G_UNICODE_BREAK_WORD_JOINER,
 543--    G_UNICODE_BREAK_HANGUL_L_JAMO,
 544--    G_UNICODE_BREAK_HANGUL_V_JAMO,
 545--    G_UNICODE_BREAK_HANGUL_T_JAMO,
 546--    G_UNICODE_BREAK_HANGUL_LV_SYLLABLE,
 547--    G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
 548--  } GUnicodeBreakType;
 549
 550--    These are the possible line break classifications. The five Hangul types were
 551--    added in Unicode 4.1, so they were introduced in GLib 2.10. Note that new types
 552--    may be added in the future. Applications should be ready to handle unknown
 553--    values. They may be regarded as G_UNICODE_BREAK_UNKNOWN. See
 554--    http://www.unicode.org/unicode/reports/tr14/.
 555
 556--    ---------------------------------------------------------------------------------
 557
 558--   g_unichar_break_type ()
 559
 560--  GUnicodeBreakType g_unichar_break_type      (gunichar c);
 561
 562--    Determines the break type of c. c should be a Unicode character (to derive a
 563--    character from UTF-8 encoded text, use g_utf8_get_char()). The break type is used
 564--    to find word and line breaks ("text boundaries"). Pango implements the Unicode
 565--    boundary resolution algorithms and normally you would use a function such as
 566--    pango_break() instead of caring about break types yourself.
 567
 568--    c :       a Unicode character
 569--    Returns : the break type of c
 570
 571--    ---------------------------------------------------------------------------------
 572
 573--   g_unicode_canonical_ordering ()
 574
 575--  void        g_unicode_canonical_ordering    (gunichar *string,
 576--                                               gsize len);
 577
 578--    Computes the canonical ordering of a string in-place. This rearranges decomposed
 579--    characters in the string according to their combining classes. See the Unicode
 580--    manual for more information.
 581
 582--    string : a UCS-4 encoded string.
 583--    len :    the maximum length of string to use.
 584
 585--    ---------------------------------------------------------------------------------
 586
 587--   g_unicode_canonical_decomposition ()
 588
 589--  gunichar*   g_unicode_canonical_decomposition
 590--                                              (gunichar ch,
 591--                                               gsize *result_len);
 592
 593--    Computes the canonical decomposition of a Unicode character.
 594
 595--    ch :         a Unicode character.
 596--    result_len : location to store the length of the return value.
 597--    Returns :    a newly allocated string of Unicode characters. result_len is set to
 598--                 the resulting length of the string.
 599
 600--    ---------------------------------------------------------------------------------
 601
 602--   g_unichar_get_mirror_char ()
 603
 604--  gboolean    g_unichar_get_mirror_char       (gunichar ch,
 605--                                               gunichar *mirrored_ch);
 606
 607--    In Unicode, some characters are mirrored. This means that their images are
 608--    mirrored horizontally in text that is laid out from right to left. For instance,
 609--    "(" would become its mirror image, ")", in right-to-left text.
 610
 611--    If ch has the Unicode mirrored property and there is another unicode character
 612--    that typically has a glyph that is the mirror image of ch's glyph and mirrored_ch
 613--    is set, it puts that character in the address pointed to by mirrored_ch.
 614--    Otherwise the original character is put.
 615
 616--    ch :          a Unicode character
 617--    mirrored_ch : location to store the mirrored character
 618--    Returns :     TRUE if ch has a mirrored character, FALSE otherwise
 619
 620--    Since 2.4
 621
 622--    ---------------------------------------------------------------------------------
 623
 624--   g_utf8_next_char()
 625
 626--  #define     g_utf8_next_char(p)
 627
 628--    Skips to the next character in a UTF-8 string. The string must be valid; this
 629--    macro is as fast as possible, and has no error-checking. You would use this macro
 630--    to iterate over a string character by character. The macro returns the start of
 631--    the next UTF-8 character. Before using this macro, use g_utf8_validate() to
 632--    validate strings that may contain invalid UTF-8.
 633
 634--    p : Pointer to the start of a valid UTF-8 character.
 635
 636--    ---------------------------------------------------------------------------------
 637
 638--   g_utf8_get_char ()
 639
 640--  gunichar    g_utf8_get_char                 (const gchar *p);
 641
 642--    Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If p does
 643--    not point to a valid UTF-8 encoded character, results are undefined. If you are
 644--    not sure that the bytes are complete valid Unicode characters, you should use
 645--    g_utf8_get_char_validated() instead.
 646
 647--    p :       a pointer to Unicode character encoded as UTF-8
 648--    Returns : the resulting character
 649
 650--    ---------------------------------------------------------------------------------
 651
 652--   g_utf8_get_char_validated ()
 653
 654--  gunichar    g_utf8_get_char_validated       (const gchar *p,
 655--                                               gssize max_len);
 656
 657--    Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This
 658--    function checks for incomplete characters, for invalid characters such as
 659--    characters that are out of the range of Unicode, and for overlong encodings of
 660--    valid characters.
 661
 662--    p :       a pointer to Unicode character encoded as UTF-8
 663--    max_len : the maximum number of bytes to read, or -1, for no maximum.
 664--    Returns : the resulting character. If p points to a partial sequence at the end
 665--              of a string that could begin a valid character, returns (gunichar)-2;
 666--              otherwise, if p does not point to a valid UTF-8 encoded Unicode
 667--              character, returns (gunichar)-1.
 668
 669--    ---------------------------------------------------------------------------------
 670
 671--   g_utf8_offset_to_pointer ()
 672
 673--  gchar*      g_utf8_offset_to_pointer        (const gchar *str,
 674--                                               glong offset);
 675
 676--    Converts from an integer character offset to a pointer to a position within the
 677--    string.
 678
 679--    Since 2.10, this function allows passing a negative offset to step backwards. It
 680--    is usually worth stepping backwards from the end instead of forwards if offset is
 681--    in the last fourth of the string, since moving forward is about 3 times faster
 682--    than moving backward.
 683
 684--    str :     a UTF-8 encoded string
 685--    offset :  a character offset within str
 686--    Returns : the resulting pointer
 687
 688--    ---------------------------------------------------------------------------------
 689
 690--   g_utf8_pointer_to_offset ()
 691
 692--  glong       g_utf8_pointer_to_offset        (const gchar *str,
 693--                                               const gchar *pos);
 694
 695--    Converts from a pointer to a position within a string to an integer character
 696--    offset.
 697
 698--    Since 2.10, this function allows pos to be before str, and returns a negative
 699--    offset in this case.
 700
 701--    str :     a UTF-8 encoded string
 702--    pos :     a pointer to a position within str
 703--    Returns : the resulting character offset
 704
 705--    ---------------------------------------------------------------------------------
 706
 707--   g_utf8_prev_char ()
 708
 709--  gchar*      g_utf8_prev_char                (const gchar *p);
 710
 711--    Finds the previous UTF-8 character in the string before p.
 712
 713--    p does not have to be at the beginning of a UTF-8 character. No check is made to
 714--    see if the character found is actually valid other than it starts with an
 715--    appropriate byte. If p might be the first character of the string, you must use
 716--    g_utf8_find_prev_char() instead.
 717
 718--    p :       a pointer to a position within a UTF-8 encoded string
 719--    Returns : a pointer to the found character.
 720
 721--    ---------------------------------------------------------------------------------
 722
 723--   g_utf8_find_next_char ()
 724
 725--  gchar*      g_utf8_find_next_char           (const gchar *p,
 726--                                               const gchar *end);
 727
 728--    Finds the start of the next UTF-8 character in the string after p.
 729
 730--    p does not have to be at the beginning of a UTF-8 character. No check is made to
 731--    see if the character found is actually valid other than it starts with an
 732--    appropriate byte.
 733
 734--    p :       a pointer to a position within a UTF-8 encoded string
 735--    end :     a pointer to the end of the string, or NULL to indicate that the string
 736--              is nul-terminated.
 737--    Returns : a pointer to the found character or NULL
 738
 739--    ---------------------------------------------------------------------------------
 740
 741--   g_utf8_find_prev_char ()
 742
 743--  gchar*      g_utf8_find_prev_char           (const gchar *str,
 744--                                               const gchar *p);
 745
 746--    Given a position p within a UTF-8 encoded string str, find the start of the
 747--    previous UTF-8 character starting before p. Returns NULL if no UTF-8 characters
 748--    are present in str before p.
 749
 750--    p does not have to be at the beginning of a UTF-8 character. No check is made to
 751--    see if the character found is actually valid other than it starts with an
 752--    appropriate byte.
 753
 754--    str :     pointer to the beginning of a UTF-8 encoded string
 755--    p :       pointer to some position within str
 756--    Returns : a pointer to the found character or NULL.
 757
 758--    ---------------------------------------------------------------------------------
 759
 760--   g_utf8_strlen ()
 761
 762--  glong       g_utf8_strlen                   (const gchar *p,
 763--                                               gssize max);
 764
 765--    Returns the length of the string in characters.
 766
 767--    p :       pointer to the start of a UTF-8 encoded string.
 768--    max :     the maximum number of bytes to examine. If max is less than 0, then the
 769--              string is assumed to be nul-terminated. If max is 0, p will not be
 770--              examined and may be NULL.
 771--    Returns : the length of the string in characters
 772
 773--    ---------------------------------------------------------------------------------
 774
 775--   g_utf8_strncpy ()
 776
 777--  gchar*      g_utf8_strncpy                  (gchar *dest,
 778--                                               const gchar *src,
 779--                                               gsize n);
 780
 781--    Like the standard C strncpy() function, but copies a given number of characters
 782--    instead of a given number of bytes. The src string must be valid UTF-8 encoded
 783--    text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility
 784--    functions with it.)
 785
 786--    dest :    buffer to fill with characters from src
 787--    src :     UTF-8 encoded string
 788--    n :       character count
 789--    Returns : dest
 790
 791--    ---------------------------------------------------------------------------------
 792
 793--   g_utf8_strchr ()
 794
 795--  gchar*      g_utf8_strchr                   (const gchar *p,
 796--                                               gssize len,
 797--                                               gunichar c);
 798
 799--    Finds the leftmost occurrence of the given Unicode character in a UTF-8 encoded
 800--    string, while limiting the search to len bytes. If len is -1, allow unbounded
 801--    search.
 802
 803--    p :       a nul-terminated UTF-8 encoded string
 804--    len :     the maximum length of p
 805--    c :       a Unicode character
 806--    Returns : NULL if the string does not contain the character, otherwise, a pointer
 807--              to the start of the leftmost occurrence of the character in the string.
 808
 809--    ---------------------------------------------------------------------------------
 810
 811--   g_utf8_strrchr ()
 812
 813--  gchar*      g_utf8_strrchr                  (const gchar *p,
 814--                                               gssize len,
 815--                                               gunichar c);
 816
 817--    Find the rightmost occurrence of the given Unicode character in a UTF-8 encoded
 818--    string, while limiting the search to len bytes. If len is -1, allow unbounded
 819--    search.
 820
 821--    p :       a nul-terminated UTF-8 encoded string
 822--    len :     the maximum length of p
 823--    c :       a Unicode character
 824--    Returns : NULL if the string does not contain the character, otherwise, a pointer
 825--              to the start of the rightmost occurrence of the character in the
 826--              string.
 827
 828--    ---------------------------------------------------------------------------------
 829
 830--   g_utf8_strreverse ()
 831
 832--  gchar*      g_utf8_strreverse               (const gchar *str,
 833--                                               gssize len);
 834
 835--    Reverses a UTF-8 string. str must be valid UTF-8 encoded text. (Use
 836--    g_utf8_validate() on all text before trying to use UTF-8 utility functions with
 837--    it.)
 838
 839--    Note that unlike g_strreverse(), this function returns newly-allocated memory,
 840--    which should be freed with g_free() when no longer needed.
 841
 842--    str :     a UTF-8 encoded string
 843--    len :     the maximum length of str to use. If len < 0, then the string is
 844--              nul-terminated.
 845--    Returns : a newly-allocated string which is the reverse of str.
 846
 847--    Since 2.2
 848
 849--    ---------------------------------------------------------------------------------
 850
 851--   g_utf8_validate ()
 852
 853--  gboolean    g_utf8_validate                 (const gchar *str,
 854--                                               gssize max_len,
 855--                                               const gchar **end);
 856
 857--    Validates UTF-8 encoded text. str is the text to validate; if str is
 858--    nul-terminated, then max_len can be -1, otherwise max_len should be the number of
 859--    bytes to validate. If end is non-NULL, then the end of the valid range will be
 860--    stored there (i.e. the start of the first invalid character if some bytes were
 861--    invalid, or the end of the text being validated otherwise).
 862
 863--    Note that g_utf8_validate() returns FALSE if max_len is positive and NUL is met
 864--    before max_len bytes have been read.
 865
 866--    Returns TRUE if all of str was valid. Many GLib and GTK+ routines require valid
 867--    UTF-8 as input; so data read from a file or the network should be checked with
 868--    g_utf8_validate() before doing anything else with it.
 869
 870--    str :     a pointer to character data
 871--    max_len : max bytes to validate, or -1 to go until NUL
 872--    end :     return location for end of valid data
 873--    Returns : TRUE if the text was valid UTF-8
 874
 875--    ---------------------------------------------------------------------------------
 876
 877--   g_utf8_strup ()
 878
 879--  gchar*      g_utf8_strup                    (const gchar *str,
 880--                                               gssize len);
 881
 882--    Converts all Unicode characters in the string that have a case to uppercase. The
 883--    exact manner that this is done depends on the current locale, and may result in
 884--    the number of characters in the string increasing. (For instance, the German
 885--    ess-zet will be changed to SS.)
 886
 887--    str :     a UTF-8 encoded string
 888--    len :     length of str, in bytes, or -1 if str is nul-terminated.
 889--    Returns : a newly allocated string, with all characters converted to uppercase.
 890
 891--    ---------------------------------------------------------------------------------
 892
 893--   g_utf8_strdown ()
 894
 895--  gchar*      g_utf8_strdown                  (const gchar *str,
 896--                                               gssize len);
 897
 898--    Converts all Unicode characters in the string that have a case to lowercase. The
 899--    exact manner that this is done depends on the current locale, and may result in
 900--    the number of characters in the string changing.
 901
 902--    str :     a UTF-8 encoded string
 903--    len :     length of str, in bytes, or -1 if str is nul-terminated.
 904--    Returns : a newly allocated string, with all characters converted to lowercase.
 905
 906--    ---------------------------------------------------------------------------------
 907
 908--   g_utf8_casefold ()
 909
 910--  gchar*      g_utf8_casefold                 (const gchar *str,
 911--                                               gssize len);
 912
 913--    Converts a string into a form that is independent of case. The result will not
 914--    correspond to any particular case, but can be compared for equality or ordered
 915--    with the results of calling g_utf8_casefold() on other strings.
 916
 917--    Note that calling g_utf8_casefold() followed by g_utf8_collate() is only an
 918--    approximation to the correct linguistic case insensitive ordering, though it is a
 919--    fairly good one. Getting this exactly right would require a more sophisticated
 920--    collation function that takes case sensitivity into account. GLib does not
 921--    currently provide such a function.
 922
 923--    str :     a UTF-8 encoded string
 924--    len :     length of str, in bytes, or -1 if str is nul-terminated.
 925--    Returns : a newly allocated string, that is a case independent form of str.
 926
 927--    ---------------------------------------------------------------------------------
 928
 929--   g_utf8_normalize ()
 930
 931--  gchar*      g_utf8_normalize                (const gchar *str,
 932--                                               gssize len,
 933--                                               GNormalizeMode mode);
 934
 935--    Converts a string into canonical form, standardizing such issues as whether a
 936--    character with an accent is represented as a base character and combining accent
 937--    or as a single precomposed character. You should generally call
 938--    g_utf8_normalize() before comparing two Unicode strings.
 939
 940--    The normalization mode G_NORMALIZE_DEFAULT only standardizes differences that do
 941--    not affect the text content, such as the above-mentioned accent representation.
 942--    G_NORMALIZE_ALL also standardizes the "compatibility" characters in Unicode, such
 943--    as SUPERSCRIPT THREE to the standard forms (in this case DIGIT THREE). Formatting
 944--    information may be lost but for most text operations such characters should be
 945--    considered the same. For example, g_utf8_collate() normalizes with
 946--    G_NORMALIZE_ALL as its first step.
 947
 948--    G_NORMALIZE_DEFAULT_COMPOSE and G_NORMALIZE_ALL_COMPOSE are like
 949--    G_NORMALIZE_DEFAULT and G_NORMALIZE_ALL, but return a result with composed
 950--    forms rather than a maximally decomposed form. This is often useful if you intend
 951--    to convert the string to a legacy encoding or pass it to a system with less
 952--    capable Unicode handling.
 953
 954--    str :     a UTF-8 encoded string.
 955--    len :     length of str, in bytes, or -1 if str is nul-terminated.
 956--    mode :    the type of normalization to perform.
 957--    Returns : a newly allocated string, that is the normalized form of str.
 958
 959--    ---------------------------------------------------------------------------------
 960
 961--   enum GNormalizeMode
 962
 963--  typedef enum {
 964--    G_NORMALIZE_DEFAULT,
 965--    G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
 966--    G_NORMALIZE_DEFAULT_COMPOSE,
 967--    G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
 968--    G_NORMALIZE_ALL,
 969--    G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
 970--    G_NORMALIZE_ALL_COMPOSE,
 971--    G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
 972--  } GNormalizeMode;
 973
 974--    Defines how a Unicode string is transformed into a canonical form, standardizing
 975--    such issues as whether a character with an accent is represented as a base
 976--    character and combining accent or as a single precomposed character. Unicode
 977--    strings should generally be normalized before comparing them.
 978
 979--    G_NORMALIZE_DEFAULT         standardize differences that do not affect the text
 980--                                content, such as the above-mentioned accent
 981--                                representation.
 982--    G_NORMALIZE_NFD             another name for G_NORMALIZE_DEFAULT.
 983--    G_NORMALIZE_DEFAULT_COMPOSE like G_NORMALIZE_DEFAULT, but with composed forms
 984--                                rather than a maximally decomposed form.
 985--    G_NORMALIZE_NFC             another name for G_NORMALIZE_DEFAULT_COMPOSE.
 986--    G_NORMALIZE_ALL             beyond G_NORMALIZE_DEFAULT also standardize the
 987--                                "compatibility" characters in Unicode, such as
 988--                                SUPERSCRIPT THREE to the standard forms (in this case
 989--                                DIGIT THREE). Formatting information may be lost but
 990--                                for most text operations such characters should be
 991--                                considered the same.
 992--    G_NORMALIZE_NFKD            another name for G_NORMALIZE_ALL.
 993--    G_NORMALIZE_ALL_COMPOSE     like G_NORMALIZE_ALL, but with composed forms rather
 994--                                than a maximally decomposed form.
 995--    G_NORMALIZE_NFKC            another name for G_NORMALIZE_ALL_COMPOSE.
 996
 997--    ---------------------------------------------------------------------------------
 998
 999--   g_utf8_collate ()
1000
1001--  gint        g_utf8_collate                  (const gchar *str1,
1002--                                               const gchar *str2);
1003
1004--    Compares two strings for ordering using the linguistically correct rules for the
1005--    current locale. When sorting a large number of strings, it will be significantly
1006--    faster to obtain collation keys with g_utf8_collate_key() and compare the keys
1007--    with strcmp() when sorting instead of sorting the original strings.
1008
1009--    str1 :    a UTF-8 encoded string
1010--    str2 :    a UTF-8 encoded string
1011--    Returns : < 0 if str1 compares before str2, 0 if they compare equal, > 0 if str1
1012--              compares after str2.
1013
1014--    ---------------------------------------------------------------------------------
1015
1016--   g_utf8_collate_key ()
1017
1018--  gchar*      g_utf8_collate_key              (const gchar *str,
1019--                                               gssize len);
1020
1021--    Converts a string into a collation key that can be compared with other collation
1022--    keys produced by the same function using strcmp(). The results of comparing the
1023--    collation keys of two strings with strcmp() will always be the same as comparing
1024--    the two original strings with g_utf8_collate().
1025
1026--    str :     a UTF-8 encoded string.
1027--    len :     length of str, in bytes, or -1 if str is nul-terminated.
1028--    Returns : a newly allocated string. This string should be freed with g_free()
1029--              when you are done with it.
1030
1031--    ---------------------------------------------------------------------------------
1032
1033--   g_utf8_collate_key_for_filename ()
1034
1035--  gchar*      g_utf8_collate_key_for_filename (const gchar *str,
1036--                                               gssize len);
1037
1038--    Converts a string into a collation key that can be compared with other collation
1039--    keys produced by the same function using strcmp().
1040
1041--    In order to sort filenames correctly, this function treats the dot '.' as a
1042--    special case. Most dictionary orderings seem to consider it insignificant, thus
1043--    producing the ordering "event.c" "eventgenerator.c" "event.h" instead of
1044--    "event.c" "event.h" "eventgenerator.c". Also, we would like to treat numbers
1045--    intelligently so that "file1" "file10" "file5" is sorted as "file1" "file5"
1046--    "file10".
1047
1048--    str :     a UTF-8 encoded string.
1049--    len :     length of str, in bytes, or -1 if str is nul-terminated.
1050--    Returns : a newly allocated string. This string should be freed with g_free()
1051--              when you are done with it.
1052
1053--    Since 2.8
1054
1055--    ---------------------------------------------------------------------------------
1056
1057--   g_utf8_to_utf16 ()
1058
1059--  gunichar2*  g_utf8_to_utf16                 (const gchar *str,
1060--                                               glong len,
1061--                                               glong *items_read,
1062--                                               glong *items_written,
1063--                                               GError **error);
1064
1065--    Convert a string from UTF-8 to UTF-16. A 0 character will be added to the result
1066--    after the converted text.
1067
1068--    str :           a UTF-8 encoded string
1069--    len :           the maximum length (number of bytes) of str to use. If len <
1070--                    0, then the string is nul-terminated.
1071--    items_read :    location to store number of bytes read, or NULL. If NULL, then
1072--                    G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str
1073--                    contains a trailing partial character. If an error occurs then
1074--                    the index of the invalid input is stored here.
1075--    items_written : location to store number of gunichar2 written, or NULL. The value
1076--                    stored here does not include the trailing 0.
1077--    error :         location to store the error occurring, or NULL to ignore errors.
1078--                    Any of the errors in GConvertError other than
1079--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
1080--    Returns :       a pointer to a newly allocated UTF-16 string. This value must be
1081--                    freed with g_free(). If an error occurs, NULL will be returned
1082--                    and error set.
1083
1084--    ---------------------------------------------------------------------------------
1085
1086--   g_utf8_to_ucs4 ()
1087
1088--  gunichar*   g_utf8_to_ucs4                  (const gchar *str,
1089--                                               glong len,
1090--                                               glong *items_read,
1091--                                               glong *items_written,
1092--                                               GError **error);
1093
1094--    Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4. A
1095--    trailing 0 will be added to the string after the converted text.
1096
1097--    str :           a UTF-8 encoded string
1098--    len :           the maximum length of str to use. If len < 0, then the string is
1099--                    nul-terminated.
1100--    items_read :    location to store number of bytes read, or NULL. If NULL, then
1101--                    G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str
1102--                    contains a trailing partial character. If an error occurs then
1103--                    the index of the invalid input is stored here.
1104--    items_written : location to store number of characters written or NULL. The value
1105--                    stored here does not include the trailing 0 character.
1106--    error :         location to store the error occurring, or NULL to ignore errors.
1107--                    Any of the errors in GConvertError other than
1108--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
1109--    Returns :       a pointer to a newly allocated UCS-4 string. This value must be
1110--                    freed with g_free(). If an error occurs, NULL will be returned
1111--                    and error set.
1112
1113--    ---------------------------------------------------------------------------------
1114
1115--   g_utf8_to_ucs4_fast ()
1116
1117--  gunichar*   g_utf8_to_ucs4_fast             (const gchar *str,
1118--                                               glong len,
1119--                                               glong *items_written);
1120
1121--    Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4,
1122--    assuming valid UTF-8 input. This function is roughly twice as fast as
1123--    g_utf8_to_ucs4() but does no error checking on the input.
1124
1125--    str :           a UTF-8 encoded string
1126--    len :           the maximum length of str to use. If len < 0, then the string is
1127--                    nul-terminated.
1128--    items_written : location to store the number of characters in the result, or
1129--                    NULL.
1130--    Returns :       a pointer to a newly allocated UCS-4 string. This value must be
1131--                    freed with g_free().
1132
1133--    ---------------------------------------------------------------------------------
1134
1135--   g_utf16_to_ucs4 ()
1136
1137--  gunichar*   g_utf16_to_ucs4                 (const gunichar2 *str,
1138--                                               glong len,
1139--                                               glong *items_read,
1140--                                               glong *items_written,
1141--                                               GError **error);
1142
1143--    Convert a string from UTF-16 to UCS-4. The result will be terminated with a 0
1144--    character.
1145
1146--    str :           a UTF-16 encoded string
1147--    len :           the maximum length (number of gunichar2) of str to use. If len <
1148

Large files files are truncated, but you can click here to view the full file