glib_unicode_manipulation.e

/src/wrappers/glib/library/utilities/glib_unicode_manipulation.e

http://github.com/tybor/Liberty
Specman e | 1278 lines | 35 code | 338 blank | 905 comment | 2 complexity | b03c41098c1a40f12d152ad7cc22961f MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, GPL-2.0

indexing
	description: "Unicode Manipulation -- functions operating on Unicode characters and UTF-8 strings."
	copyright: "[
					Copyright (C) 2007 Paolo Redaelli, Anthony Lenton,
					                   Soluciones Informaticas Libres S.A., GLib team
					
					This library is free software; you can redistribute it and/or
					modify it under the terms of the GNU Lesser General Public License
					as published by the Free Software Foundation; either version 2.1 of
					the License, or (at your option) any later version.
					
					This library is distributed in the hope that it will be useful, but
					WITHOUT ANY WARRANTY; without even the implied warranty of
					MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
					Lesser General Public License for more details.

					You should have received a copy of the GNU Lesser General Public
					License along with this library; if not, write to the Free Software
					Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
					02110-1301 USA
			]"

deferred class GLIB_UNICODE_MANIPULATION

insert
	ANY undefine is_equal, copy end

feature {} -- Utility functions, inherit them if you need them

--    Unicode Manipulation -- functions operating on Unicode characters and UTF-8
--    strings.

-- Synopsis


--  #include <glib.h>


--  typedef     gunichar;
--  typedef     gunichar2;

--  gboolean    g_unichar_validate              (gunichar ch);
--  gboolean    g_unichar_isalnum               (gunichar c);
--  gboolean    g_unichar_isalpha               (gunichar c);
--  gboolean    g_unichar_iscntrl               (gunichar c);
--  gboolean    g_unichar_isdigit               (gunichar c);
--  gboolean    g_unichar_isgraph               (gunichar c);
--  gboolean    g_unichar_islower               (gunichar c);
--  gboolean    g_unichar_isprint               (gunichar c);
--  gboolean    g_unichar_ispunct               (gunichar c);
--  gboolean    g_unichar_isspace               (gunichar c);
--  gboolean    g_unichar_isupper               (gunichar c);
--  gboolean    g_unichar_isxdigit              (gunichar c);
--  gboolean    g_unichar_istitle               (gunichar c);
--  gboolean    g_unichar_isdefined             (gunichar c);
--  gboolean    g_unichar_iswide                (gunichar c);
--  gboolean    g_unichar_iswide_cjk            (gunichar c);
--  gunichar    g_unichar_toupper               (gunichar c);
--  gunichar    g_unichar_tolower               (gunichar c);
--  gunichar    g_unichar_totitle               (gunichar c);
--  gint        g_unichar_digit_value           (gunichar c);
--  gint        g_unichar_xdigit_value          (gunichar c);
--  enum        GUnicodeType;
--  GUnicodeType g_unichar_type                 (gunichar c);
--  enum        GUnicodeBreakType;
--  GUnicodeBreakType g_unichar_break_type      (gunichar c);
--  void        g_unicode_canonical_ordering    (gunichar *string,
--                                               gsize len);
--  gunichar*   g_unicode_canonical_decomposition
--                                              (gunichar ch,
--                                               gsize *result_len);
--  gboolean    g_unichar_get_mirror_char       (gunichar ch,
--                                               gunichar *mirrored_ch);

--  #define     g_utf8_next_char                (p)
--  gunichar    g_utf8_get_char                 (const gchar *p);
--  gunichar    g_utf8_get_char_validated       (const gchar *p,
--                                               gssize max_len);
--  gchar*      g_utf8_offset_to_pointer        (const gchar *str,
--                                               glong offset);
--  glong       g_utf8_pointer_to_offset        (const gchar *str,
--                                               const gchar *pos);
--  gchar*      g_utf8_prev_char                (const gchar *p);
--  gchar*      g_utf8_find_next_char           (const gchar *p,
--                                               const gchar *end);
--  gchar*      g_utf8_find_prev_char           (const gchar *str,
--                                               const gchar *p);
--  glong       g_utf8_strlen                   (const gchar *p,
--                                               gssize max);
--  gchar*      g_utf8_strncpy                  (gchar *dest,
--                                               const gchar *src,
--                                               gsize n);
--  gchar*      g_utf8_strchr                   (const gchar *p,
--                                               gssize len,
--                                               gunichar c);
--  gchar*      g_utf8_strrchr                  (const gchar *p,
--                                               gssize len,
--                                               gunichar c);
--  gchar*      g_utf8_strreverse               (const gchar *str,
--                                               gssize len);
--  gboolean    g_utf8_validate                 (const gchar *str,
--                                               gssize max_len,
--                                               const gchar **end);

--  gchar*      g_utf8_strup                    (const gchar *str,
--                                               gssize len);
--  gchar*      g_utf8_strdown                  (const gchar *str,
--                                               gssize len);
--  gchar*      g_utf8_casefold                 (const gchar *str,
--                                               gssize len);
--  gchar*      g_utf8_normalize                (const gchar *str,
--                                               gssize len,
--                                               GNormalizeMode mode);
--  enum        GNormalizeMode;
--  gint        g_utf8_collate                  (const gchar *str1,
--                                               const gchar *str2);
--  gchar*      g_utf8_collate_key              (const gchar *str,
--                                               gssize len);
--  gchar*      g_utf8_collate_key_for_filename (const gchar *str,
--                                               gssize len);

--  gunichar2*  g_utf8_to_utf16                 (const gchar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);
--  gunichar*   g_utf8_to_ucs4                  (const gchar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);
--  gunichar*   g_utf8_to_ucs4_fast             (const gchar *str,
--                                               glong len,
--                                               glong *items_written);
--  gunichar*   g_utf16_to_ucs4                 (const gunichar2 *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);
--  gchar*      g_utf16_to_utf8                 (const gunichar2 *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);
--  gunichar2*  g_ucs4_to_utf16                 (const gunichar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);
--  gchar*      g_ucs4_to_utf8                  (const gunichar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);
--  gint        g_unichar_to_utf8               (gunichar c,
--                                               gchar *outbuf);

-- Description

--    This section describes a number of functions for dealing with Unicode characters
--    and strings. There are analogues of the traditional ctype.h character
--    classification and case conversion functions, UTF-8 analogues of some string
--    utility functions, functions to perform normalization, case conversion and
--    collation on UTF-8 strings and finally functions to convert between the UTF-8,
--    UTF-16 and UCS-4 encodings of Unicode.

--    The implementations of the Unicode functions in GLib are based on the Unicode
--    Character Data tables, which are available from www.unicode.org. GLib 2.8
--    supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, GLib 2.12 supports Unicode
--    5.0.

-- Details

--   gunichar

--  typedef guint32 gunichar;

--    A type which can hold any UCS-4 character code.

--    ---------------------------------------------------------------------------------

--   gunichar2

--  typedef guint16 gunichar2;

--    A type which can hold any UTF-16 code point^[3].

--    ---------------------------------------------------------------------------------

--   g_unichar_validate ()

--  gboolean    g_unichar_validate              (gunichar ch);

--    Checks whether ch is a valid Unicode character. Some possible integer values of
--    ch will not be valid. 0 is considered a valid character, though it's normally a
--    string terminator.

--    ch :      a Unicode character
--    Returns : TRUE if ch is a valid Unicode character

--    ---------------------------------------------------------------------------------

--   g_unichar_isalnum ()

--  gboolean    g_unichar_isalnum               (gunichar c);

--    Determines whether a character is alphanumeric. Given some UTF-8 text, obtain a
--    character value with g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is an alphanumeric character

--    ---------------------------------------------------------------------------------

--   g_unichar_isalpha ()

--  gboolean    g_unichar_isalpha               (gunichar c);

--    Determines whether a character is alphabetic (i.e. a letter). Given some UTF-8
--    text, obtain a character value with g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is an alphabetic character

--    ---------------------------------------------------------------------------------

--   g_unichar_iscntrl ()

--  gboolean    g_unichar_iscntrl               (gunichar c);

--    Determines whether a character is a control character. Given some UTF-8 text,
--    obtain a character value with g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is a control character

--    ---------------------------------------------------------------------------------

--   g_unichar_isdigit ()

--  gboolean    g_unichar_isdigit               (gunichar c);

--    Determines whether a character is numeric (i.e. a digit). This covers ASCII 0-9
--    and also digits in other languages/scripts. Given some UTF-8 text, obtain a
--    character value with g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is a digit

--    ---------------------------------------------------------------------------------

--   g_unichar_isgraph ()

--  gboolean    g_unichar_isgraph               (gunichar c);

--    Determines whether a character is printable and not a space (returns FALSE for
--    control characters, format characters, and spaces). g_unichar_isprint() is
--    similar, but returns TRUE for spaces. Given some UTF-8 text, obtain a character
--    value with g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is printable unless it's a space

--    ---------------------------------------------------------------------------------

--   g_unichar_islower ()

--  gboolean    g_unichar_islower               (gunichar c);

--    Determines whether a character is a lowercase letter. Given some UTF-8 text,
--    obtain a character value with g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is a lowercase letter

--    ---------------------------------------------------------------------------------

--   g_unichar_isprint ()

--  gboolean    g_unichar_isprint               (gunichar c);

--    Determines whether a character is printable. Unlike g_unichar_isgraph(), returns
--    TRUE for spaces. Given some UTF-8 text, obtain a character value with
--    g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is printable

--    ---------------------------------------------------------------------------------

--   g_unichar_ispunct ()

--  gboolean    g_unichar_ispunct               (gunichar c);

--    Determines whether a character is punctuation or a symbol. Given some UTF-8 text,
--    obtain a character value with g_utf8_get_char().

--    c :       a Unicode character
--    Returns : TRUE if c is a punctuation or symbol character

--    ---------------------------------------------------------------------------------

--   g_unichar_isspace ()

--  gboolean    g_unichar_isspace               (gunichar c);

--    Determines whether a character is a space, tab, or line separator (newline,
--    carriage return, etc.). Given some UTF-8 text, obtain a character value with
--    g_utf8_get_char().

--    (Note: don't use this to do word breaking; you have to use Pango or equivalent to
--    get word breaking right, the algorithm is fairly complex.)

--    c :       a Unicode character
--    Returns : TRUE if c is a space character

--    ---------------------------------------------------------------------------------

--   g_unichar_isupper ()

--  gboolean    g_unichar_isupper               (gunichar c);

--    Determines if a character is uppercase.

--    c :       a Unicode character
--    Returns : TRUE if c is an uppercase character

--    ---------------------------------------------------------------------------------

--   g_unichar_isxdigit ()

--  gboolean    g_unichar_isxdigit              (gunichar c);

--    Determines if a character is a hexadecimal digit.

--    c :       a Unicode character.
--    Returns : TRUE if the character is a hexadecimal digit

--    ---------------------------------------------------------------------------------

--   g_unichar_istitle ()

--  gboolean    g_unichar_istitle               (gunichar c);

--    Determines if a character is titlecase. Some characters in Unicode which are
--    composites, such as the DZ digraph have three case variants instead of just two.
--    The titlecase form is used at the beginning of a word where only the first letter
--    is capitalized. The titlecase form of the DZ digraph is U+01F2 LATIN CAPITAL
--    LETTER D WITH SMALL LETTER Z.

--    c :       a Unicode character
--    Returns : TRUE if the character is titlecase

--    ---------------------------------------------------------------------------------

--   g_unichar_isdefined ()

--  gboolean    g_unichar_isdefined             (gunichar c);

--    Determines if a given character is assigned in the Unicode standard.

--    c :       a Unicode character
--    Returns : TRUE if the character has an assigned value

--    ---------------------------------------------------------------------------------

--   g_unichar_iswide ()

--  gboolean    g_unichar_iswide                (gunichar c);

--    Determines if a character is typically rendered in a double-width cell.

--    c :       a Unicode character
--    Returns : TRUE if the character is wide

--    ---------------------------------------------------------------------------------

--   g_unichar_iswide_cjk ()

--  gboolean    g_unichar_iswide_cjk            (gunichar c);

--    Determines if a character is typically rendered in a double-width cell under
--    legacy East Asian locales. If a character is wide according to
--    g_unichar_iswide(), then it is also reported wide with this function, but the
--    converse is not necessarily true. See the Unicode Standard Annex 11 for details.

--    c :       a Unicode character
--    Returns : TRUE if the character is wide in legacy East Asian locales

--    Since 2.12

--    ---------------------------------------------------------------------------------

--   g_unichar_toupper ()

--  gunichar    g_unichar_toupper               (gunichar c);

--    Converts a character to uppercase.

--    c :       a Unicode character
--    Returns : the result of converting c to uppercase. If c is not a lowercase or
--              titlecase character, or has no uppercase equivalent, c is returned
--              unchanged.

--    ---------------------------------------------------------------------------------

--   g_unichar_tolower ()

--  gunichar    g_unichar_tolower               (gunichar c);

--    Converts a character to lower case.

--    c :       a Unicode character.
--    Returns : the result of converting c to lower case. If c is not an uppercase or
--              titlecase character, or has no lowercase equivalent, c is returned
--              unchanged.

--    ---------------------------------------------------------------------------------

--   g_unichar_totitle ()

--  gunichar    g_unichar_totitle               (gunichar c);

--    Converts a character to the titlecase.

--    c :       a Unicode character
--    Returns : the result of converting c to titlecase. If c is not an uppercase or
--              lowercase character, c is returned unchanged.

--    ---------------------------------------------------------------------------------

--   g_unichar_digit_value ()

--  gint        g_unichar_digit_value           (gunichar c);

--    Determines the numeric value of a character as a decimal digit.

--    c :       a Unicode character
--    Returns : If c is a decimal digit (according to g_unichar_isdigit()), its numeric
--              value. Otherwise, -1.

--    ---------------------------------------------------------------------------------

--   g_unichar_xdigit_value ()

--  gint        g_unichar_xdigit_value          (gunichar c);

--    Determines the numeric value of a character as a hexadecimal digit.

--    c :       a Unicode character
--    Returns : If c is a hex digit (according to g_unichar_isxdigit()), its numeric
--              value. Otherwise, -1.

--    ---------------------------------------------------------------------------------

--   enum GUnicodeType

--  typedef enum
--  {
--    G_UNICODE_CONTROL,
--    G_UNICODE_FORMAT,
--    G_UNICODE_UNASSIGNED,
--    G_UNICODE_PRIVATE_USE,
--    G_UNICODE_SURROGATE,
--    G_UNICODE_LOWERCASE_LETTER,
--    G_UNICODE_MODIFIER_LETTER,
--    G_UNICODE_OTHER_LETTER,
--    G_UNICODE_TITLECASE_LETTER,
--    G_UNICODE_UPPERCASE_LETTER,
--    G_UNICODE_COMBINING_MARK,
--    G_UNICODE_ENCLOSING_MARK,
--    G_UNICODE_NON_SPACING_MARK,
--    G_UNICODE_DECIMAL_NUMBER,
--    G_UNICODE_LETTER_NUMBER,
--    G_UNICODE_OTHER_NUMBER,
--    G_UNICODE_CONNECT_PUNCTUATION,
--    G_UNICODE_DASH_PUNCTUATION,
--    G_UNICODE_CLOSE_PUNCTUATION,
--    G_UNICODE_FINAL_PUNCTUATION,
--    G_UNICODE_INITIAL_PUNCTUATION,
--    G_UNICODE_OTHER_PUNCTUATION,
--    G_UNICODE_OPEN_PUNCTUATION,
--    G_UNICODE_CURRENCY_SYMBOL,
--    G_UNICODE_MODIFIER_SYMBOL,
--    G_UNICODE_MATH_SYMBOL,
--    G_UNICODE_OTHER_SYMBOL,
--    G_UNICODE_LINE_SEPARATOR,
--    G_UNICODE_PARAGRAPH_SEPARATOR,
--    G_UNICODE_SPACE_SEPARATOR
--  } GUnicodeType;

--    These are the possible character classifications. See
--    http://www.unicode.org/Public/UNIDATA/UnicodeData.html.

--    ---------------------------------------------------------------------------------

--   g_unichar_type ()

--  GUnicodeType g_unichar_type                 (gunichar c);

--    Classifies a Unicode character by type.

--    c :       a Unicode character
--    Returns : the type of the character.

--    ---------------------------------------------------------------------------------

--   enum GUnicodeBreakType

--  typedef enum
--  {
--    G_UNICODE_BREAK_MANDATORY,
--    G_UNICODE_BREAK_CARRIAGE_RETURN,
--    G_UNICODE_BREAK_LINE_FEED,
--    G_UNICODE_BREAK_COMBINING_MARK,
--    G_UNICODE_BREAK_SURROGATE,
--    G_UNICODE_BREAK_ZERO_WIDTH_SPACE,
--    G_UNICODE_BREAK_INSEPARABLE,
--    G_UNICODE_BREAK_NON_BREAKING_GLUE,
--    G_UNICODE_BREAK_CONTINGENT,
--    G_UNICODE_BREAK_SPACE,
--    G_UNICODE_BREAK_AFTER,
--    G_UNICODE_BREAK_BEFORE,
--    G_UNICODE_BREAK_BEFORE_AND_AFTER,
--    G_UNICODE_BREAK_HYPHEN,
--    G_UNICODE_BREAK_NON_STARTER,
--    G_UNICODE_BREAK_OPEN_PUNCTUATION,
--    G_UNICODE_BREAK_CLOSE_PUNCTUATION,
--    G_UNICODE_BREAK_QUOTATION,
--    G_UNICODE_BREAK_EXCLAMATION,
--    G_UNICODE_BREAK_IDEOGRAPHIC,
--    G_UNICODE_BREAK_NUMERIC,
--    G_UNICODE_BREAK_INFIX_SEPARATOR,
--    G_UNICODE_BREAK_SYMBOL,
--    G_UNICODE_BREAK_ALPHABETIC,
--    G_UNICODE_BREAK_PREFIX,
--    G_UNICODE_BREAK_POSTFIX,
--    G_UNICODE_BREAK_COMPLEX_CONTEXT,
--    G_UNICODE_BREAK_AMBIGUOUS,
--    G_UNICODE_BREAK_UNKNOWN,
--    G_UNICODE_BREAK_NEXT_LINE,
--    G_UNICODE_BREAK_WORD_JOINER,
--    G_UNICODE_BREAK_HANGUL_L_JAMO,
--    G_UNICODE_BREAK_HANGUL_V_JAMO,
--    G_UNICODE_BREAK_HANGUL_T_JAMO,
--    G_UNICODE_BREAK_HANGUL_LV_SYLLABLE,
--    G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
--  } GUnicodeBreakType;

--    These are the possible line break classifications. The five Hangul types were
--    added in Unicode 4.1, so they were introduced in GLib 2.10. Note that new types
--    may be added in the future. Applications should be ready to handle unknown
--    values. They may be regarded as G_UNICODE_BREAK_UNKNOWN. See
--    http://www.unicode.org/unicode/reports/tr14/.

--    ---------------------------------------------------------------------------------

--   g_unichar_break_type ()

--  GUnicodeBreakType g_unichar_break_type      (gunichar c);

--    Determines the break type of c. c should be a Unicode character (to derive a
--    character from UTF-8 encoded text, use g_utf8_get_char()). The break type is used
--    to find word and line breaks ("text boundaries"). Pango implements the Unicode
--    boundary resolution algorithms, and normally you would use a function such as
--    pango_break() instead of caring about break types yourself.

--    c :       a Unicode character
--    Returns : the break type of c

--    ---------------------------------------------------------------------------------

--   g_unicode_canonical_ordering ()

--  void        g_unicode_canonical_ordering    (gunichar *string,
--                                               gsize len);

--    Computes the canonical ordering of a string in-place. This rearranges decomposed
--    characters in the string according to their combining classes. See the Unicode
--    manual for more information.

--    string : a UCS-4 encoded string.
--    len :    the maximum length of string to use.

--    ---------------------------------------------------------------------------------

--   g_unicode_canonical_decomposition ()

--  gunichar*   g_unicode_canonical_decomposition
--                                              (gunichar ch,
--                                               gsize *result_len);

--    Computes the canonical decomposition of a Unicode character.

--    ch :         a Unicode character.
--    result_len : location to store the length of the return value.
--    Returns :    a newly allocated string of Unicode characters. result_len is set to
--                 the resulting length of the string.

--    ---------------------------------------------------------------------------------

--   g_unichar_get_mirror_char ()

--  gboolean    g_unichar_get_mirror_char       (gunichar ch,
--                                               gunichar *mirrored_ch);

--    In Unicode, some characters are mirrored. This means that their images are
--    mirrored horizontally in text that is laid out from right to left. For instance,
--    "(" would become its mirror image, ")", in right-to-left text.

--    If ch has the Unicode mirrored property and there is another Unicode character
--    that typically has a glyph that is the mirror image of ch's glyph and mirrored_ch
--    is set, it puts that character in the address pointed to by mirrored_ch.
--    Otherwise the original character is stored there.

--    ch :          a Unicode character
--    mirrored_ch : location to store the mirrored character
--    Returns :     TRUE if ch has a mirrored character, FALSE otherwise

--    Since 2.4

--    ---------------------------------------------------------------------------------

--   g_utf8_next_char()

--  #define     g_utf8_next_char(p)

--    Skips to the next character in a UTF-8 string. The string must be valid; this
--    macro is as fast as possible, and has no error-checking. You would use this macro
--    to iterate over a string character by character. The macro returns the start of
--    the next UTF-8 character. Before using this macro, use g_utf8_validate() to
--    validate strings that may contain invalid UTF-8.

--    p : Pointer to the start of a valid UTF-8 character.

--    ---------------------------------------------------------------------------------

--   g_utf8_get_char ()

--  gunichar    g_utf8_get_char                 (const gchar *p);

--    Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If p does
--    not point to a valid UTF-8 encoded character, results are undefined. If you are
--    not sure that the bytes are complete valid Unicode characters, you should use
--    g_utf8_get_char_validated() instead.

--    p :       a pointer to Unicode character encoded as UTF-8
--    Returns : the resulting character

--    ---------------------------------------------------------------------------------

--   g_utf8_get_char_validated ()

--  gunichar    g_utf8_get_char_validated       (const gchar *p,
--                                               gssize max_len);

--    Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This
--    function checks for incomplete characters, for invalid characters such as
--    characters that are out of the range of Unicode, and for overlong encodings of
--    valid characters.

--    p :       a pointer to Unicode character encoded as UTF-8
--    max_len : the maximum number of bytes to read, or -1, for no maximum.
--    Returns : the resulting character. If p points to a partial sequence at the end
--              of a string that could begin a valid character, returns (gunichar)-2;
--              otherwise, if p does not point to a valid UTF-8 encoded Unicode
--              character, returns (gunichar)-1.

--    ---------------------------------------------------------------------------------

--   g_utf8_offset_to_pointer ()

--  gchar*      g_utf8_offset_to_pointer        (const gchar *str,
--                                               glong offset);

--    Converts from an integer character offset to a pointer to a position within the
--    string.

--    Since 2.10, this function allows passing a negative offset to step backwards. It
--    is usually worth stepping backwards from the end instead of forwards if offset is
--    in the last fourth of the string, since moving forward is about 3 times faster
--    than moving backward.

--    str :     a UTF-8 encoded string
--    offset :  a character offset within str
--    Returns : the resulting pointer

--    ---------------------------------------------------------------------------------

--   g_utf8_pointer_to_offset ()

--  glong       g_utf8_pointer_to_offset        (const gchar *str,
--                                               const gchar *pos);

--    Converts from a pointer to a position within a string to an integer character
--    offset.

--    Since 2.10, this function allows pos to be before str, and returns a negative
--    offset in this case.

--    str :     a UTF-8 encoded string
--    pos :     a pointer to a position within str
--    Returns : the resulting character offset

--    ---------------------------------------------------------------------------------

--   g_utf8_prev_char ()

--  gchar*      g_utf8_prev_char                (const gchar *p);

--    Finds the previous UTF-8 character in the string before p.

--    p does not have to be at the beginning of a UTF-8 character. No check is made to
--    see if the character found is actually valid other than it starts with an
--    appropriate byte. If p might be the first character of the string, you must use
--    g_utf8_find_prev_char() instead.

--    p :       a pointer to a position within a UTF-8 encoded string
--    Returns : a pointer to the found character.

--    ---------------------------------------------------------------------------------

--   g_utf8_find_next_char ()

--  gchar*      g_utf8_find_next_char           (const gchar *p,
--                                               const gchar *end);

--    Finds the start of the next UTF-8 character in the string after p.

--    p does not have to be at the beginning of a UTF-8 character. No check is made to
--    see if the character found is actually valid other than it starts with an
--    appropriate byte.

--    p :       a pointer to a position within a UTF-8 encoded string
--    end :     a pointer to the end of the string, or NULL to indicate that the string
--              is nul-terminated
--    Returns : a pointer to the found character or NULL

--    ---------------------------------------------------------------------------------

--   g_utf8_find_prev_char ()

--  gchar*      g_utf8_find_prev_char           (const gchar *str,
--                                               const gchar *p);

--    Given a position p within a UTF-8 encoded string str, find the start of the
--    previous UTF-8 character starting before p. Returns NULL if no UTF-8 characters
--    are present in str before p.

--    p does not have to be at the beginning of a UTF-8 character. No check is made to
--    see if the character found is actually valid other than it starts with an
--    appropriate byte.

--    str :     pointer to the beginning of a UTF-8 encoded string
--    p :       pointer to some position within str
--    Returns : a pointer to the found character or NULL.

--    ---------------------------------------------------------------------------------

--   g_utf8_strlen ()

--  glong       g_utf8_strlen                   (const gchar *p,
--                                               gssize max);

--    Returns the length of the string in characters.

--    p :       pointer to the start of a UTF-8 encoded string.
--    max :     the maximum number of bytes to examine. If max is less than 0, then the
--              string is assumed to be nul-terminated. If max is 0, p will not be
--              examined and may be NULL.
--    Returns : the length of the string in characters

--    ---------------------------------------------------------------------------------

--   g_utf8_strncpy ()

--  gchar*      g_utf8_strncpy                  (gchar *dest,
--                                               const gchar *src,
--                                               gsize n);

--    Like the standard C strncpy() function, but copies a given number of characters
--    instead of a given number of bytes. The src string must be valid UTF-8 encoded
--    text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility
--    functions with it.)

--    dest :    buffer to fill with characters from src
--    src :     UTF-8 encoded string
--    n :       character count
--    Returns : dest

--    ---------------------------------------------------------------------------------

--   g_utf8_strchr ()

--  gchar*      g_utf8_strchr                   (const gchar *p,
--                                               gssize len,
--                                               gunichar c);

--    Finds the leftmost occurrence of the given Unicode character in a UTF-8 encoded
--    string, while limiting the search to len bytes. If len is -1, allow unbounded
--    search.

--    p :       a nul-terminated UTF-8 encoded string
--    len :     the maximum length of p
--    c :       a Unicode character
--    Returns : NULL if the string does not contain the character, otherwise, a pointer
--              to the start of the leftmost occurrence of the character in the string.

--    ---------------------------------------------------------------------------------

--   g_utf8_strrchr ()

--  gchar*      g_utf8_strrchr                  (const gchar *p,
--                                               gssize len,
--                                               gunichar c);

--    Find the rightmost occurrence of the given Unicode character in a UTF-8 encoded
--    string, while limiting the search to len bytes. If len is -1, allow unbounded
--    search.

--    p :       a nul-terminated UTF-8 encoded string
--    len :     the maximum length of p
--    c :       a Unicode character
--    Returns : NULL if the string does not contain the character, otherwise, a pointer
--              to the start of the rightmost occurrence of the character in the
--              string.

--    ---------------------------------------------------------------------------------

--   g_utf8_strreverse ()

--  gchar*      g_utf8_strreverse               (const gchar *str,
--                                               gssize len);

--    Reverses a UTF-8 string. str must be valid UTF-8 encoded text. (Use
--    g_utf8_validate() on all text before trying to use UTF-8 utility functions with
--    it.)

--    Note that unlike g_strreverse(), this function returns newly-allocated memory,
--    which should be freed with g_free() when no longer needed.

--    str :     a UTF-8 encoded string
--    len :     the maximum length of str to use. If len < 0, then the string is
--              nul-terminated.
--    Returns : a newly-allocated string which is the reverse of str.

--    Since 2.2

--    ---------------------------------------------------------------------------------

--   g_utf8_validate ()

--  gboolean    g_utf8_validate                 (const gchar *str,
--                                               gssize max_len,
--                                               const gchar **end);

--    Validates UTF-8 encoded text. str is the text to validate; if str is
--    nul-terminated, then max_len can be -1, otherwise max_len should be the number of
--    bytes to validate. If end is non-NULL, then the end of the valid range will be
--    stored there (i.e. the start of the first invalid character if some bytes were
--    invalid, or the end of the text being validated otherwise).

--    Note that g_utf8_validate() returns FALSE if max_len is positive and NUL is met
--    before max_len bytes have been read.

--    Returns TRUE if all of str was valid. Many GLib and GTK+ routines require valid
--    UTF-8 as input; so data read from a file or the network should be checked with
--    g_utf8_validate() before doing anything else with it.

--    str :     a pointer to character data
--    max_len : max bytes to validate, or -1 to go until NUL
--    end :     return location for end of valid data
--    Returns : TRUE if the text was valid UTF-8

--    ---------------------------------------------------------------------------------

--   g_utf8_strup ()

--  gchar*      g_utf8_strup                    (const gchar *str,
--                                               gssize len);

--    Converts all Unicode characters in the string that have a case to uppercase. The
--    exact manner that this is done depends on the current locale, and may result in
--    the number of characters in the string increasing. (For instance, the German
--    ess-zet will be changed to SS.)

--    str :     a UTF-8 encoded string
--    len :     length of str, in bytes, or -1 if str is nul-terminated.
--    Returns : a newly allocated string, with all characters converted to uppercase.

--    ---------------------------------------------------------------------------------

--   g_utf8_strdown ()

--  gchar*      g_utf8_strdown                  (const gchar *str,
--                                               gssize len);

--    Converts all Unicode characters in the string that have a case to lowercase. The
--    exact manner that this is done depends on the current locale, and may result in
--    the number of characters in the string changing.

--    str :     a UTF-8 encoded string
--    len :     length of str, in bytes, or -1 if str is nul-terminated.
--    Returns : a newly allocated string, with all characters converted to lowercase.

--    ---------------------------------------------------------------------------------

--   g_utf8_casefold ()

--  gchar*      g_utf8_casefold                 (const gchar *str,
--                                               gssize len);

--    Converts a string into a form that is independent of case. The result will not
--    correspond to any particular case, but can be compared for equality or ordered
--    with the results of calling g_utf8_casefold() on other strings.

--    Note that calling g_utf8_casefold() followed by g_utf8_collate() is only an
--    approximation to the correct linguistic case insensitive ordering, though it is a
--    fairly good one. Getting this exactly right would require a more sophisticated
--    collation function that takes case sensitivity into account. GLib does not
--    currently provide such a function.

--    str :     a UTF-8 encoded string
--    len :     length of str, in bytes, or -1 if str is nul-terminated.
--    Returns : a newly allocated string, that is a case independent form of str.

--    ---------------------------------------------------------------------------------

--   g_utf8_normalize ()

--  gchar*      g_utf8_normalize                (const gchar *str,
--                                               gssize len,
--                                               GNormalizeMode mode);

--    Converts a string into canonical form, standardizing such issues as whether a
--    character with an accent is represented as a base character and combining accent
--    or as a single precomposed character. You should generally call
--    g_utf8_normalize() before comparing two Unicode strings.

--    The normalization mode G_NORMALIZE_DEFAULT only standardizes differences that do
--    not affect the text content, such as the above-mentioned accent representation.
--    G_NORMALIZE_ALL also standardizes the "compatibility" characters in Unicode, such
--    as SUPERSCRIPT THREE to the standard forms (in this case DIGIT THREE). Formatting
--    information may be lost but for most text operations such characters should be
--    considered the same. For example, g_utf8_collate() normalizes with
--    G_NORMALIZE_ALL as its first step.

--    G_NORMALIZE_DEFAULT_COMPOSE and G_NORMALIZE_ALL_COMPOSE are like
--    G_NORMALIZE_DEFAULT and G_NORMALIZE_ALL, but return a result with composed
--    forms rather than a maximally decomposed form. This is often useful if you intend
--    to convert the string to a legacy encoding or pass it to a system with less
--    capable Unicode handling.

--    str :     a UTF-8 encoded string.
--    len :     length of str, in bytes, or -1 if str is nul-terminated.
--    mode :    the type of normalization to perform.
--    Returns : a newly allocated string, that is the normalized form of str.

--    ---------------------------------------------------------------------------------

--   enum GNormalizeMode

--  typedef enum {
--    G_NORMALIZE_DEFAULT,
--    G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
--    G_NORMALIZE_DEFAULT_COMPOSE,
--    G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
--    G_NORMALIZE_ALL,
--    G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
--    G_NORMALIZE_ALL_COMPOSE,
--    G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
--  } GNormalizeMode;

--    Defines how a Unicode string is transformed in a canonical form, standardizing
--    such issues as whether a character with an accent is represented as a base
--    character and combining accent or as a single precomposed character. Unicode
--    strings should generally be normalized before comparing them.

--    G_NORMALIZE_DEFAULT         standardize differences that do not affect the text
--                                content, such as the above-mentioned accent
--                                representation.
--    G_NORMALIZE_NFD             another name for G_NORMALIZE_DEFAULT.
--    G_NORMALIZE_DEFAULT_COMPOSE like G_NORMALIZE_DEFAULT, but with composed forms
--                                rather than a maximally decomposed form.
--    G_NORMALIZE_NFC             another name for G_NORMALIZE_DEFAULT_COMPOSE.
--    G_NORMALIZE_ALL             beyond G_NORMALIZE_DEFAULT also standardize the
--                                "compatibility" characters in Unicode, such as
--                                SUPERSCRIPT THREE to the standard forms (in this case
--                                DIGIT THREE). Formatting information may be lost but
--                                for most text operations such characters should be
--                                considered the same.
--    G_NORMALIZE_NFKD            another name for G_NORMALIZE_ALL.
--    G_NORMALIZE_ALL_COMPOSE     like G_NORMALIZE_ALL, but with composed forms rather
--                                than a maximally decomposed form.
--    G_NORMALIZE_NFKC            another name for G_NORMALIZE_ALL_COMPOSE.

--    ---------------------------------------------------------------------------------

--   g_utf8_collate ()

--  gint        g_utf8_collate                  (const gchar *str1,
--                                               const gchar *str2);

--    Compares two strings for ordering using the linguistically correct rules for the
--    current locale. When sorting a large number of strings, it will be significantly
--    faster to obtain collation keys with g_utf8_collate_key() and compare the keys
--    with strcmp() when sorting instead of sorting the original strings.

--    str1 :    a UTF-8 encoded string
--    str2 :    a UTF-8 encoded string
--    Returns : < 0 if str1 compares before str2, 0 if they compare equal, > 0 if str1
--              compares after str2.

--    ---------------------------------------------------------------------------------

--   g_utf8_collate_key ()

--  gchar*      g_utf8_collate_key              (const gchar *str,
--                                               gssize len);

--    Converts a string into a collation key that can be compared with other collation
--    keys produced by the same function using strcmp(). The results of comparing the
--    collation keys of two strings with strcmp() will always be the same as comparing
--    the two original strings with g_utf8_collate().

--    str :     a UTF-8 encoded string.
--    len :     length of str, in bytes, or -1 if str is nul-terminated.
--    Returns : a newly allocated string. This string should be freed with g_free()
--              when you are done with it.

--    ---------------------------------------------------------------------------------

--   g_utf8_collate_key_for_filename ()

--  gchar*      g_utf8_collate_key_for_filename (const gchar *str,
--                                               gssize len);

--    Converts a string into a collation key that can be compared with other collation
--    keys produced by the same function using strcmp().

--    In order to sort filenames correctly, this function treats the dot '.' as a
--    special case. Most dictionary orderings seem to consider it insignificant, thus
--    producing the ordering "event.c" "eventgenerator.c" "event.h" instead of
--    "event.c" "event.h" "eventgenerator.c". Also, we would like to treat numbers
--    intelligently so that "file1" "file10" "file5" is sorted as "file1" "file5"
--    "file10".

--    str :     a UTF-8 encoded string.
--    len :     length of str, in bytes, or -1 if str is nul-terminated.
--    Returns : a newly allocated string. This string should be freed with g_free()
--              when you are done with it.

--    Since 2.8

--    ---------------------------------------------------------------------------------

--   g_utf8_to_utf16 ()

--  gunichar2*  g_utf8_to_utf16                 (const gchar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);

--    Convert a string from UTF-8 to UTF-16. A 0 character will be added to the result
--    after the converted text.

--    str :           a UTF-8 encoded string
--    len :           the maximum length (number of bytes) of str to use. If len <
--                    0, then the string is nul-terminated.
--    items_read :    location to store number of bytes read, or NULL. If NULL, then
--                    G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str
--                    contains a trailing partial character. If an error occurs then
--                    the index of the invalid input is stored here.
--    items_written : location to store number of gunichar2 written, or NULL. The value
--                    stored here does not include the trailing 0.
--    error :         location to store the error occurring, or NULL to ignore errors.
--                    Any of the errors in GConvertError other than
--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
--    Returns :       a pointer to a newly allocated UTF-16 string. This value must be
--                    freed with g_free(). If an error occurs, NULL will be returned
--                    and error set.

--    ---------------------------------------------------------------------------------

--   g_utf8_to_ucs4 ()

--  gunichar*   g_utf8_to_ucs4                  (const gchar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);

--    Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4. A
--    trailing 0 will be added to the string after the converted text.

--    str :           a UTF-8 encoded string
--    len :           the maximum length of str to use. If len < 0, then the string is
--                    nul-terminated.
--    items_read :    location to store number of bytes read, or NULL. If NULL, then
--                    G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str
--                    contains a trailing partial character. If an error occurs then
--                    the index of the invalid input is stored here.
--    items_written : location to store number of characters written or NULL. The value
--                    here stored does not include the trailing 0 character.
--    error :         location to store the error occurring, or NULL to ignore errors.
--                    Any of the errors in GConvertError other than
--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
--    Returns :       a pointer to a newly allocated UCS-4 string. This value must be
--                    freed with g_free(). If an error occurs, NULL will be returned
--                    and error set.

--    ---------------------------------------------------------------------------------

--   g_utf8_to_ucs4_fast ()

--  gunichar*   g_utf8_to_ucs4_fast             (const gchar *str,
--                                               glong len,
--                                               glong *items_written);

--    Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4,
--    assuming valid UTF-8 input. This function is roughly twice as fast as
--    g_utf8_to_ucs4() but does no error checking on the input.

--    str :           a UTF-8 encoded string
--    len :           the maximum length of str to use. If len < 0, then the string is
--                    nul-terminated.
--    items_written : location to store the number of characters in the result, or
--                    NULL.
--    Returns :       a pointer to a newly allocated UCS-4 string. This value must be
--                    freed with g_free().

--    ---------------------------------------------------------------------------------

--   g_utf16_to_ucs4 ()

--  gunichar*   g_utf16_to_ucs4                 (const gunichar2 *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);

--    Convert a string from UTF-16 to UCS-4. The result will be terminated with a 0
--    character.

--    str :           a UTF-16 encoded string
--    len :           the maximum length (number of gunichar2) of str to use. If len <
--                    0, then the string is terminated with a 0 character.
--    items_read :    location to store number of words read, or NULL. If NULL, then
--                    G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str
--                    contains a trailing partial character. If an error occurs then
--                    the index of the invalid input is stored here.
--    items_written : location to store number of characters written, or NULL. The
--                    value stored here does not include the trailing 0 character.
--    error :         location to store the error occurring, or NULL to ignore errors.
--                    Any of the errors in GConvertError other than
--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
--    Returns :       a pointer to a newly allocated UCS-4 string. This value must be
--                    freed with g_free(). If an error occurs, NULL will be returned
--                    and error set.

--    ---------------------------------------------------------------------------------

--   g_utf16_to_utf8 ()

--  gchar*      g_utf16_to_utf8                 (const gunichar2 *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);

--    Convert a string from UTF-16 to UTF-8. The result will be terminated with a 0
--    byte.

--    Note that the input is expected to be already in native endianness, an initial
--    byte-order-mark character is not handled specially. g_convert() can be used to
--    convert a byte buffer of UTF-16 data of ambiguous endianness.

--    str :           a UTF-16 encoded string
--    len :           the maximum length (number of gunichar2) of str to use. If len <
--                    0, then the string is terminated with a 0 character.
--    items_read :    location to store number of words read, or NULL. If NULL, then
--                    G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str
--                    contains a trailing partial character. If an error occurs then
--                    the index of the invalid input is stored here.
--    items_written : location to store number of bytes written, or NULL. The value
--                    stored here does not include the trailing 0 byte.
--    error :         location to store the error occurring, or NULL to ignore errors.
--                    Any of the errors in GConvertError other than
--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
--    Returns :       a pointer to a newly allocated UTF-8 string. This value must be
--                    freed with g_free(). If an error occurs, NULL will be returned
--                    and error set.

--    ---------------------------------------------------------------------------------

--   g_ucs4_to_utf16 ()

--  gunichar2*  g_ucs4_to_utf16                 (const gunichar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);

--    Convert a string from UCS-4 to UTF-16. A 0 character will be added to the result
--    after the converted text.

--    str :           a UCS-4 encoded string
--    len :           the maximum length (number of characters) of str to use. If len <
--                    0, then the string is terminated with a 0 character.
--    items_read :    location to store number of bytes read, or NULL. If an error
--                    occurs then the index of the invalid input is stored here.
--    items_written : location to store number of gunichar2 written, or NULL. The value
--                    stored here does not include the trailing 0.
--    error :         location to store the error occurring, or NULL to ignore errors.
--                    Any of the errors in GConvertError other than
--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
--    Returns :       a pointer to a newly allocated UTF-16 string. This value must be
--                    freed with g_free(). If an error occurs, NULL will be returned
--                    and error set.

--    ---------------------------------------------------------------------------------

--   g_ucs4_to_utf8 ()

--  gchar*      g_ucs4_to_utf8                  (const gunichar *str,
--                                               glong len,
--                                               glong *items_read,
--                                               glong *items_written,
--                                               GError **error);

--    Convert a string from a 32-bit fixed width representation as UCS-4 to UTF-8. The
--    result will be terminated with a 0 byte.

--    str :           a UCS-4 encoded string
--    len :           the maximum length (number of characters) of str to use. If len <
--                    0, then the string is terminated with a 0 character.
--    items_read :    location to store number of characters read, or NULL.
--    items_written : location to store number of bytes written or NULL. The value here
--                    stored does not include the trailing 0 byte.
--    error :         location to store the error occurring, or NULL to ignore errors.
--                    Any of the errors in GConvertError other than
--                    G_CONVERT_ERROR_NO_CONVERSION may occur.
--    Returns :       a pointer to a newly allocated UTF-8 string. This value must be
--                    freed with g_free(). If an error occurs, NULL will be returned
--                    and error set. In that case, items_read will be set to the
--                    position of the first invalid input character.

--    ---------------------------------------------------------------------------------

	unichar_to_utf8 (a_gunichar: INTEGER): STRING is
			-- UTF-8 encoding of the single Unicode character whose code
			-- is `a_gunichar', returned as a newly created string.
		local
			written: INTEGER
		do
			-- Scratch buffer of 6 placeholder characters handed to the C
			-- side; the placeholders are overwritten by the external call.
			create Result.make_filled ('x', 6)
			written := g_unichar_to_utf8 (a_gunichar, Result.to_external)
			-- Keep only the leading `written' characters; the external
			-- call's result is used as the length of the encoding.
			Result.keep_head (written)
		end

feature {} -- External calls

	g_unichar_to_utf8 (a_gunichar: INTEGER; an_outbuf: POINTER): INTEGER is
			-- Direct binding to the C function `g_unichar_to_utf8'
			-- declared in <glib.h>.
			-- Per the GLib reference (TODO confirm for the GLib version in
			-- use): writes the UTF-8 encoding of `a_gunichar' into
			-- `an_outbuf', which must have room for at least 6 bytes, and
			-- returns the number of bytes written.
		external "C use <glib.h>"
		end

-- See Also

--    g_locale_to_utf8(), g_locale_from_utf8() Convenience functions for converting
--                                             between UTF-8 and the locale encoding.

--    --------------

--    ^[3] surrogate pairs

end -- deferred class GLIB_UNICODE_MANIPULATION