/src/wrappers/glib/library/utilities/glib_character_set_conversion.e
Specman e | 570 lines | 76 code | 120 blank | 374 comment | 6 complexity | 88105fb02acffcbdc1c72e0a234787f1 MD5 | raw file
indexing
	description: "Character Set Conversion -- convert strings between different character sets."
	copyright: "[
					Copyright (C) 2007 Anthony Lenton, Soluciones Informaticas Libres S.A.,
					GLib team

					This library is free software; you can redistribute it and/or
					modify it under the terms of the GNU Lesser General Public License
					as published by the Free Software Foundation; either version 2.1 of
					the License, or (at your option) any later version.

					This library is distributed in the hope that it will be useful, but
					WITHOUT ANY WARRANTY; without even the implied warranty of
					MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
					Lesser General Public License for more details.

					You should have received a copy of the GNU Lesser General Public
					License along with this library; if not, write to the Free Software
					Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
					02110-1301 USA
					]"

deferred class GLIB_CHARACTER_SET_CONVERSION

feature -- Conversions

	-- Every routine below forwards to the GLib function of the same name
	-- (prefixed with "g_"), passing the Eiffel string's C storage and its
	-- `count' as the length, and NULL as the GError location, so the only
	-- failure signal visible to the caller is a Void Result.
	--
	-- The `bytes_read' / `bytes_written' locals exist only to give GLib a
	-- place to store its gsize out-values; they are never read back. They
	-- are INTEGER_64 because gsize is 8 bytes wide on 64-bit targets and a
	-- 32-bit INTEGER local would be overrun.
	--
	-- NOTE(review): GLib documents the returned buffer as "must be freed
	-- with g_free()". `from_external' presumably adopts the pointer without
	-- copying it; confirm who is responsible for releasing that memory.

	gconvert (a_string, to_codeset, from_codeset: STRING): STRING is
			-- Converts a string from one character set to another.
			-- Note that you should use iconv() for streaming conversions.
			-- a_string     : the string to convert
			-- to_codeset   : name of character set into which to convert `a_string'
			-- from_codeset : character set of `a_string'.
			-- Returns      : If the conversion was successful, a newly allocated
			--                string. Otherwise Result is Void.
		require
			string_not_void: a_string /= Void
			to_codeset_not_void: to_codeset /= Void
			from_codeset_not_void: from_codeset /= Void
		local
			res_ptr: POINTER
			bytes_read, bytes_written: INTEGER_64
		do
			res_ptr := g_convert (a_string.to_external, a_string.count,
			                      to_codeset.to_external, from_codeset.to_external,
			                      $bytes_read, $bytes_written, default_pointer)
			if res_ptr.is_not_null then
				create Result.from_external (res_ptr)
			end
		end

	locale_to_utf8 (a_string: STRING): STRING is
			-- Converts a string which is in the encoding used for strings by the C runtime
			-- (usually the same as that used by the operating system) in the current locale
			-- into a UTF-8 string.
			-- a_string : a string in the encoding of the current locale. On Windows this
			--            means the system codepage.
			-- Returns  : The converted string, or Void on an error.
		require
			string_not_void: a_string /= Void
		local
			res_ptr: POINTER
			bytes_read, bytes_written: INTEGER_64
		do
			res_ptr := g_locale_to_utf8 (a_string.to_external, a_string.count,
			                             $bytes_read, $bytes_written, default_pointer)
			if res_ptr.is_not_null then
				create Result.from_external (res_ptr)
			end
		end

	filename_from_utf8 (an_utf8_string: STRING): STRING is
			-- Converts a string from UTF-8 to the encoding GLib uses for filenames. Note that
			-- on Windows GLib uses UTF-8 for filenames.
			-- an_utf8_string : a UTF-8 encoded string.
			-- Returns        : The converted string, or Void on an error.
		require
			string_not_void: an_utf8_string /= Void
		local
			res_ptr: POINTER
			bytes_read, bytes_written: INTEGER_64
		do
			res_ptr := g_filename_from_utf8 (an_utf8_string.to_external, an_utf8_string.count,
			                                 $bytes_read, $bytes_written, default_pointer)
			if res_ptr.is_not_null then
				create Result.from_external (res_ptr)
			end
		end

	locale_from_utf8 (a_string: STRING): STRING is
			-- Converts a string from UTF-8 to the encoding used for strings by the C runtime
			-- (usually the same as that used by the operating system) in the current locale.
			-- a_string : a UTF-8 encoded string
			-- Returns  : The converted string, or Void on an error.
		require
			string_not_void: a_string /= Void
		local
			res_ptr: POINTER
			bytes_read, bytes_written: INTEGER_64
		do
			res_ptr := g_locale_from_utf8 (a_string.to_external, a_string.count,
			                               $bytes_read, $bytes_written, default_pointer)
			if res_ptr.is_not_null then
				create Result.from_external (res_ptr)
			end
		end

feature {} -- External calls

	g_convert (a_str: POINTER; a_len: INTEGER; to_codeset, from_codeset, bytes_read, bytes_written, error: POINTER): POINTER is
			-- Converts a string from one character set to another.
			-- Note that you should use g_iconv() for streaming conversions^[2].
			-- str           : the string to convert
			-- len           : the length of the string, or -1 if the string is
			--                 nul-terminated. Note that some encodings may
			--                 allow nul bytes to occur inside strings. In that
			--                 case, using -1 for the len parameter is unsafe.
			-- to_codeset    : name of character set into which to convert str
			-- from_codeset  : character set of str.
			-- bytes_read    : location to store the number of bytes in the input string that
			--                 were successfully converted, or NULL. Even if the conversion was
			--                 successful, this may be less than len if there were partial
			--                 characters at the end of the input. If the error
			--                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will
			--                 be the byte offset after the last valid input sequence.
			-- bytes_written : the number of bytes stored in the output buffer (not including
			--                 the terminating nul).
			-- error         : location to store the error occurring, or NULL to ignore errors.
			--                 Any of the errors in GConvertError may occur.
			-- Returns       : If the conversion was successful, a newly allocated
			--                 nul-terminated string, which must be freed with g_free().
			--                 Otherwise NULL and error will be set.
		external "C use <glib.h>"
		end

-- ---------------------------------------------------------------------------------

-- g_convert_with_fallback ()

-- gchar*      g_convert_with_fallback     (const gchar *str,
--                                          gssize len,
--                                          const gchar *to_codeset,
--                                          const gchar *from_codeset,
--                                          gchar *fallback,
--                                          gsize *bytes_read,
--                                          gsize *bytes_written,
--                                          GError **error);

-- Converts a string from one character set to another, possibly including fallback
-- sequences for characters not representable in the output. Note that it is not
-- guaranteed that the specification for the fallback sequences in fallback will be
-- honored. Some systems may do an approximate conversion from from_codeset to
-- to_codeset in their iconv() functions, in which case GLib will simply return that
-- approximate conversion.

-- Note that you should use g_iconv() for streaming conversions^[2].

-- str :           the string to convert
-- len :           the length of the string, or -1 if the string is
--                 nul-terminated^[1].
-- to_codeset :    name of character set into which to convert str
-- from_codeset :  character set of str.
-- fallback :      UTF-8 string to use in place of character not present in the
--                 target encoding. (The string must be representable in the target
--                 encoding). If NULL, characters not in the target encoding will be
--                 represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
-- bytes_read :    location to store the number of bytes in the input string that
--                 were successfully converted, or NULL. Even if the conversion was
--                 successful, this may be less than len if there were partial
--                 characters at the end of the input.
-- bytes_written : the number of bytes stored in the output buffer (not including
--                 the terminating nul).
-- error :         location to store the error occurring, or NULL to ignore errors.
--                 Any of the errors in GConvertError may occur.
-- Returns :       If the conversion was successful, a newly allocated
--                 nul-terminated string, which must be freed with g_free().
--                 Otherwise NULL and error will be set.

-- ---------------------------------------------------------------------------------

-- GIConv

-- typedef struct _GIConv GIConv;

-- The GIConv struct wraps an iconv() conversion descriptor. It contains private
-- data and should only be accessed using the following functions.

-- ---------------------------------------------------------------------------------

-- g_convert_with_iconv ()

-- gchar*      g_convert_with_iconv        (const gchar *str,
--                                          gssize len,
--                                          GIConv converter,
--                                          gsize *bytes_read,
--                                          gsize *bytes_written,
--                                          GError **error);

-- Converts a string from one character set to another.

-- Note that you should use g_iconv() for streaming conversions^[2].

-- str :           the string to convert
-- len :           the length of the string, or -1 if the string is
--                 nul-terminated^[1].
-- converter :     conversion descriptor from g_iconv_open()
-- bytes_read :    location to store the number of bytes in the input string that
--                 were successfully converted, or NULL. Even if the conversion was
--                 successful, this may be less than len if there were partial
--                 characters at the end of the input. If the error
--                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will
--                 be the byte offset after the last valid input sequence.
-- bytes_written : the number of bytes stored in the output buffer (not including
--                 the terminating nul).
-- error :         location to store the error occurring, or NULL to ignore errors.
--                 Any of the errors in GConvertError may occur.
-- Returns :       If the conversion was successful, a newly allocated
--                 nul-terminated string, which must be freed with g_free().
--                 Otherwise NULL and error will be set.

-- ---------------------------------------------------------------------------------

-- G_CONVERT_ERROR

-- #define G_CONVERT_ERROR g_convert_error_quark()

-- Error domain for character set conversions. Errors in this domain will be from
-- the GConvertError enumeration. See GError for information on error domains.

-- ---------------------------------------------------------------------------------

-- g_iconv_open ()

-- GIConv      g_iconv_open                (const gchar *to_codeset,
--                                          const gchar *from_codeset);

-- Same as the standard UNIX routine iconv_open(), but may be implemented via
-- libiconv on UNIX flavors that lack a native implementation.

-- GLib provides g_convert() and g_locale_to_utf8() which are likely more convenient
-- than the raw iconv wrappers.

-- to_codeset :   destination codeset
-- from_codeset : source codeset
-- Returns :      a "conversion descriptor", or (GIConv)-1 if opening the converter
--                failed.

-- ---------------------------------------------------------------------------------

-- g_iconv ()

-- size_t      g_iconv                     (GIConv converter,
--                                          gchar **inbuf,
--                                          gsize *inbytes_left,
--                                          gchar **outbuf,
--                                          gsize *outbytes_left);

-- Same as the standard UNIX routine iconv(), but may be implemented via libiconv on
-- UNIX flavors that lack a native implementation.

-- GLib provides g_convert() and g_locale_to_utf8() which are likely more convenient
-- than the raw iconv wrappers.

-- converter :     conversion descriptor from g_iconv_open()
-- inbuf :         bytes to convert
-- inbytes_left :  inout parameter, bytes remaining to convert in inbuf
-- outbuf :        converted output bytes
-- outbytes_left : inout parameter, bytes available to fill in outbuf
-- Returns :       count of non-reversible conversions, or -1 on error

-- ---------------------------------------------------------------------------------

-- g_iconv_close ()

-- gint        g_iconv_close               (GIConv converter);

-- Same as the standard UNIX routine iconv_close(), but may be implemented via
-- libiconv on UNIX flavors that lack a native implementation. Should be called to
-- clean up the conversion descriptor from g_iconv_open() when you are done
-- converting things.

-- GLib provides g_convert() and g_locale_to_utf8() which are likely more convenient
-- than the raw iconv wrappers.

-- converter : a conversion descriptor from g_iconv_open()
-- Returns :   -1 on error, 0 on success

-- ---------------------------------------------------------------------------------

	g_locale_to_utf8 (opsysstring: POINTER; opsysstring_len: INTEGER; bytes_read, bytes_written, error: POINTER): POINTER is
			-- Converts a string which is in the encoding used for strings by the C runtime
			-- (usually the same as that used by the operating system) in the current locale
			-- into a UTF-8 string.
			--
			-- opsysstring     : a string in the encoding of the current locale. On Windows this
			--                   means the system codepage.
			-- opsysstring_len : the length of the string, or -1 if the string is
			--                   nul-terminated^[1].
			-- bytes_read      : location to store the number of bytes in the input string that
			--                   were successfully converted, or NULL. Even if the conversion was
			--                   successful, this may be less than len if there were partial
			--                   characters at the end of the input. If the error
			--                   G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will
			--                   be the byte offset after the last valid input sequence.
			-- bytes_written   : the number of bytes stored in the output buffer (not including
			--                   the terminating nul).
			-- error           : location to store the error occurring, or NULL to ignore errors.
			--                   Any of the errors in GConvertError may occur.
			-- Returns         : The converted string, or NULL on an error.
		external "C use <glib.h>"
		end

-- ---------------------------------------------------------------------------------

-- g_filename_to_utf8 ()

-- gchar*      g_filename_to_utf8          (const gchar *opsysstring,
--                                          gssize len,
--                                          gsize *bytes_read,
--                                          gsize *bytes_written,
--                                          GError **error);

-- Converts a string which is in the encoding used by GLib for filenames into a
-- UTF-8 string. Note that on Windows GLib uses UTF-8 for filenames.

-- opsysstring :   a string in the encoding for filenames
-- len :           the length of the string, or -1 if the string is
--                 nul-terminated^[1].
-- bytes_read :    location to store the number of bytes in the input string that
--                 were successfully converted, or NULL. Even if the conversion was
--                 successful, this may be less than len if there were partial
--                 characters at the end of the input. If the error
--                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will
--                 be the byte offset after the last valid input sequence.
-- bytes_written : the number of bytes stored in the output buffer (not including
--                 the terminating nul).
-- error :         location to store the error occurring, or NULL to ignore errors.
--                 Any of the errors in GConvertError may occur.
-- Returns :       The converted string, or NULL on an error.

-- ---------------------------------------------------------------------------------

	g_filename_from_utf8 (an_utf8_string: POINTER; a_len: INTEGER;
	                      a_bytes_read, a_bytes_written, an_error: POINTER): POINTER is
			-- Converts a string from UTF-8 to the encoding GLib uses for filenames. Note that
			-- on Windows GLib uses UTF-8 for filenames.
			-- utf8string    : a UTF-8 encoded string.
			-- len           : the length of the string, or -1 if the string is nul-terminated.
			-- bytes_read    : location to store the number of bytes in the input string that
			--                 were successfully converted, or NULL. Even if the conversion was
			--                 successful, this may be less than len if there were partial
			--                 characters at the end of the input. If the error
			--                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will
			--                 be the byte offset after the last valid input sequence.
			-- bytes_written : the number of bytes stored in the output buffer (not including
			--                 the terminating nul).
			-- error         : location to store the error occurring, or NULL to ignore errors.
			--                 Any of the errors in GConvertError may occur.
			-- Returns       : The converted string, or NULL on an error.
		external "C use <glib.h>"
		end

-- ---------------------------------------------------------------------------------

-- g_filename_from_uri ()

-- gchar*      g_filename_from_uri         (const gchar *uri,
--                                          gchar **hostname,
--                                          GError **error);

-- Converts an escaped ASCII-encoded URI to a local filename in the encoding used
-- for filenames.

-- uri :      a uri describing a filename (escaped, encoded in ASCII).
-- hostname : Location to store hostname for the URI, or NULL. If there is no
--            hostname in the URI, NULL will be stored in this location.
-- error :    location to store the error occurring, or NULL to ignore errors. Any of
--            the errors in GConvertError may occur.
-- Returns :  a newly-allocated string holding the resulting filename, or NULL on an
--            error.

-- ---------------------------------------------------------------------------------

-- g_filename_to_uri ()

-- gchar*      g_filename_to_uri           (const gchar *filename,
--                                          const gchar *hostname,
--                                          GError **error);

-- Converts an absolute filename to an escaped ASCII-encoded URI, with the path
-- component following Section 3.3. of RFC 2396.

-- filename : an absolute filename specified in the GLib file name encoding, which
--            is the on-disk file name bytes on Unix, and UTF-8 on Windows
-- hostname : A UTF-8 encoded hostname, or NULL for none.
-- error :    location to store the error occurring, or NULL to ignore errors. Any of
--            the errors in GConvertError may occur.
-- Returns :  a newly-allocated string holding the resulting URI, or NULL on an
--            error.

-- ---------------------------------------------------------------------------------

-- g_get_filename_charsets ()

-- gboolean    g_get_filename_charsets     (G_CONST_RETURN gchar ***charsets);

-- Determines the preferred character sets used for filenames. The first character
-- set from the charsets is the filename encoding, the subsequent character sets are
-- used when trying to generate a displayable representation of a filename, see
-- g_filename_display_name().

-- On Unix, the character sets are determined by consulting the environment
-- variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES. On Windows, the character
-- set used in the GLib API is always UTF-8 and said environment variables have no
-- effect.

-- G_FILENAME_ENCODING may be set to a comma-separated list of character set names.
-- The special token "locale" is taken to mean the character set for the current
-- locale. If G_FILENAME_ENCODING is not set, but G_BROKEN_FILENAMES is, the
-- character set of the current locale is taken as the filename encoding. If neither
-- environment variable is set, UTF-8 is taken as the filename encoding, but the
-- character set of the current locale is also put in the list of encodings.

-- The returned charsets belong to GLib and must not be freed.

-- Note that on Unix, regardless of the locale character set or G_FILENAME_ENCODING
-- value, the actual file names present on a system might be in any random encoding
-- or just gibberish.

-- charsets : return location for the NULL-terminated list of encoding names
-- Returns :  TRUE if the filename encoding is UTF-8.

-- Since 2.6

-- ---------------------------------------------------------------------------------

-- g_filename_display_name ()

-- gchar*      g_filename_display_name     (const gchar *filename);

-- Converts a filename into a valid UTF-8 string. The conversion is not necessarily
-- reversible, so you should keep the original around and use the return value of
-- this function only for display purposes. Unlike g_filename_to_utf8(), the result
-- is guaranteed to be non-NULL even if the filename actually isn't in the GLib file
-- name encoding.

-- If GLib can not make sense of the encoding of filename, as a last resort it
-- replaces unknown characters with U+FFFD, the Unicode replacement character. You
-- can search the result for the UTF-8 encoding of this character (which is
-- "\357\277\275" in octal notation) to find out if filename was in an invalid
-- encoding.

-- If you know the whole pathname of the file you should use
-- g_filename_display_basename(), since that allows location-based translation of
-- filenames.

-- filename : a pathname hopefully in the GLib file name encoding
-- Returns :  a newly allocated string containing a rendition of the filename in
--            valid UTF-8

-- Since 2.6

-- ---------------------------------------------------------------------------------

-- g_filename_display_basename ()

-- gchar*      g_filename_display_basename (const gchar *filename);

-- Returns the display basename for the particular filename, guaranteed to be valid
-- UTF-8. The display name might not be identical to the filename, for instance
-- there might be problems converting it to UTF-8, and some files can be translated
-- in the display.

-- If GLib can not make sense of the encoding of filename, as a last resort it
-- replaces unknown characters with U+FFFD, the Unicode replacement character. You
-- can search the result for the UTF-8 encoding of this character (which is
-- "\357\277\275" in octal notation) to find out if filename was in an invalid
-- encoding.

-- You must pass the whole absolute pathname to this function so that translation
-- of well known locations can be done.

-- This function is preferred over g_filename_display_name() if you know the whole
-- path, as it allows translation.

-- filename : an absolute pathname in the GLib file name encoding
-- Returns :  a newly allocated string containing a rendition of the basename of the
--            filename in valid UTF-8

-- Since 2.6

-- ---------------------------------------------------------------------------------

-- g_uri_list_extract_uris ()

-- gchar**     g_uri_list_extract_uris     (const gchar *uri_list);

-- Splits an URI list conforming to the text/uri-list mime type defined in RFC 2483
-- into individual URIs, discarding any comments. The URIs are not validated.

-- uri_list : an URI list
-- Returns :  a newly allocated NULL-terminated list of strings holding the
--            individual URIs. The array should be freed with g_strfreev().

-- Since 2.6

-- ---------------------------------------------------------------------------------

	g_locale_from_utf8 (a_string: POINTER; a_string_length: INTEGER; bytes_read, bytes_written, error: POINTER): POINTER is
			-- Converts a string from UTF-8 to the encoding used for strings by the C runtime
			-- (usually the same as that used by the operating system) in the current locale.
			-- utf8string    : a UTF-8 encoded string
			-- len           : the length of the string, or -1 if the string is
			--                 nul-terminated^[1].
			-- bytes_read    : location to store the number of bytes in the input string that
			--                 were successfully converted, or NULL. Even if the conversion was
			--                 successful, this may be less than len if there were partial
			--                 characters at the end of the input. If the error
			--                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value stored will
			--                 be the byte offset after the last valid input sequence.
			-- bytes_written : the number of bytes stored in the output buffer (not including
			--                 the terminating nul).
			-- error         : location to store the error occurring, or NULL to ignore errors.
			--                 Any of the errors in GConvertError may occur.
			-- Returns       : The converted string, or NULL on an error.
		external "C use <glib.h>"
		end

-- ---------------------------------------------------------------------------------

-- enum GConvertError

-- typedef enum
-- {
--   G_CONVERT_ERROR_NO_CONVERSION,
--   G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
--   G_CONVERT_ERROR_FAILED,
--   G_CONVERT_ERROR_PARTIAL_INPUT,
--   G_CONVERT_ERROR_BAD_URI,
--   G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
-- } GConvertError;

-- Error codes returned by character set conversion routines.

-- G_CONVERT_ERROR_NO_CONVERSION     Conversion between the requested character sets
--                                   is not supported.
-- G_CONVERT_ERROR_ILLEGAL_SEQUENCE  Invalid byte sequence in conversion input.
-- G_CONVERT_ERROR_FAILED            Conversion failed for some reason.
-- G_CONVERT_ERROR_PARTIAL_INPUT     Partial character sequence at end of input.
-- G_CONVERT_ERROR_BAD_URI           URI is invalid.
-- G_CONVERT_ERROR_NOT_ABSOLUTE_PATH Pathname is not an absolute path.

-- ---------------------------------------------------------------------------------

-- g_get_charset ()

-- gboolean    g_get_charset               (G_CONST_RETURN char **charset);

-- Obtains the character set for the current locale; you might use this character
-- set as an argument to g_convert(), to convert from the current locale's encoding
-- to some other encoding. (Frequently g_locale_to_utf8() and g_locale_from_utf8()
-- are nice shortcuts, though.)

-- The return value is TRUE if the locale's encoding is UTF-8, in that case you can
-- perhaps avoid calling g_convert().

-- The string returned in charset is not allocated, and should not be freed.

-- charset : return location for character set name
-- Returns : TRUE if the returned charset is UTF-8

-- --------------

-- ^[1] Note that some encodings may allow nul bytes to occur inside strings. In
-- that case, using -1 for the len parameter is unsafe.

-- ^[2] Despite the fact that bytes_read can return information about partial
-- characters, the g_convert_... functions are not generally suitable for streaming.
-- If the underlying converter being used maintains internal state, then this won't
-- be preserved across successive calls to g_convert(), g_convert_with_iconv() or
-- g_convert_with_fallback(). (An example of this is the GNU C converter for CP1255
-- which does not emit a base character until it knows that the next character is
-- not a mark that could combine with the base character.)
end -- class GLIB_CHARACTER_SET_CONVERSION