/sombok-2.2.1/lib/utf8.c
# · C · 321 lines · 255 code · 18 blank · 48 comment · 146 complexity · 1e7576dcd1853c377843a5fa7e9eedc5 MD5 · raw file
- /*
- * utf8.c - Handle UTF-8 sequence.
- *
- * Copyright (C) 2012 by Hatuka*nezumi - IKEDA Soji.
- *
- * This file is part of the Sombok Package. This program is free
- * software; you can redistribute it and/or modify it under the terms of
- * either the GNU General Public License or the Artistic License, as
- * specified in the README file.
- *
- */
- #include "sombok.h"
- /** @defgroup utf8 utf8
- * @brief Handle UTF-8 sequence.
- *
- * @note This module was introduced by release 2.1.0.
- *
- *@{*/
- /** Decode UTF-8 string to Unicode string
- *
- * @param[out] unistr Unicode string, must not be NULL.
- * @param[in] maxchars maximum number of characters to be decoded.
- * 0 means infinite
- * @param[in] utf8 source UTF-8 string
- * @param[in] utf8len length of string
- * @param[in] check 0: no check; 1: check malformed sequence; 2: check
- * surrogate too; 3: check codes beyond Unicode too
- *
- * @returns Unicode string.
- * If unistr->str was NULL or maxchars was 0 (infinite), required buffer will
- * be (re-)allocated.
- * If error occurred, NULL is returned and errno is set.
- *
- * @note unistr->str must not point to static memory.
- */
- unistr_t *sombok_decode_utf8(unistr_t *unistr, size_t maxchars,
- const char *utf8, size_t utf8len, int check)
- {
- size_t i, unilen;
- unichar_t unichar, *uni;
- int pass;
- if (unistr == NULL) {
- errno = EINVAL;
- return NULL;
- }
- uni = unistr->str;
- if (utf8 == NULL)
- utf8len = 0;
- for (pass = 1; pass <= 2; pass++) {
- for (i = 0, unilen = 0; i < utf8len; unilen++) {
- if (maxchars != 0 && maxchars < unilen + 1)
- break;
- if ((utf8[i] & 0x80) == 0) {
- if (pass == 2)
- uni[unilen] = utf8[i];
- i++;
- } else if (i + 1 < utf8len &&
- (utf8[i] & 0xE0) == 0xC0 &&
- (utf8[i + 1] & 0xC0) == 0x80) {
- if (pass == 2) {
- unichar = utf8[i] & 0x1F;
- unichar <<= 6;
- unichar |= utf8[i + 1] & 0x3F;
- uni[unilen] = unichar;
- }
- i += 2;
- } else if (i + 2 < utf8len &&
- (utf8[i] & 0xF0) == 0xE0 &&
- (utf8[i + 1] & 0xC0) == 0x80 &&
- (utf8[i + 2] & 0xC0) == 0x80) {
- if (SOMBOK_UTF8_CHECK_SURROGATE <= check &&
- (utf8[i] & 0x0F) == 0x0D && (utf8[i + 1] & 0x20) == 0x20) {
- errno = EPERM;
- return NULL;
- }
- if (pass == 2) {
- unichar = utf8[i] & 0x0F;
- unichar <<= 6;
- unichar |= utf8[i + 1] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 2] & 0x3F;
- uni[unilen] = unichar;
- }
- i += 3;
- } else if (i + 3 < utf8len &&
- (utf8[i] & 0xF8) == 0xF0 &&
- (utf8[i + 1] & 0xC0) == 0x80 &&
- (utf8[i + 2] & 0xC0) == 0x80 &&
- (utf8[i + 3] & 0xC0) == 0x80) {
- if (SOMBOK_UTF8_CHECK_NONUNICODE <= check &&
- 0x10 <
- (((utf8[i] & 0x07) << 2) | ((utf8[i + 1] & 0x30) >> 4))) {
- errno = EPERM;
- return NULL;
- }
- if (pass == 2) {
- unichar = utf8[i] & 0x07;
- unichar <<= 6;
- unichar |= utf8[i + 1] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 2] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 3] & 0x3F;
- uni[unilen] = unichar;
- }
- i += 4;
- } else if (SOMBOK_UTF8_CHECK_NONUNICODE <= check) {
- errno = EPERM;
- return NULL;
- } else if (i + 4 < utf8len &&
- (utf8[i] & 0xFC) == 0xF8 &&
- (utf8[i + 1] & 0xC0) == 0x80 &&
- (utf8[i + 2] & 0xC0) == 0x80 &&
- (utf8[i + 3] & 0xC0) == 0x80 &&
- (utf8[i + 4] & 0xC0) == 0x80) {
- if (pass == 2) {
- unichar = utf8[i] & 0x03;
- unichar <<= 6;
- unichar |= utf8[i + 1] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 2] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 3] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 4] & 0x3F;
- uni[unilen] = unichar;
- }
- i += 5;
- } else if (i + 5 < utf8len &&
- (utf8[i] & 0xFE) == 0xFC &&
- (utf8[i + 1] & 0xC0) == 0x80 &&
- (utf8[i + 2] & 0xC0) == 0x80 &&
- (utf8[i + 3] & 0xC0) == 0x80 &&
- (utf8[i + 4] & 0xC0) == 0x80 &&
- (utf8[i + 5] & 0xC0) == 0x80) {
- if (pass == 2) {
- unichar = utf8[i] & 0x01;
- unichar <<= 6;
- unichar |= utf8[i + 1] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 2] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 3] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 4] & 0x3F;
- unichar <<= 6;
- unichar |= utf8[i + 5] & 0x3F;
- uni[unilen] = unichar;
- }
- i += 6;
- } else {
- if (SOMBOK_UTF8_CHECK_MALFORMED <= check) {
- errno = EPERM;
- return NULL;
- }
- if (pass == 2)
- uni[unilen] = utf8[i];
- i++;
- }
- }
- if (pass == 1) {
- if (uni == NULL) {
- if ((uni = malloc(sizeof(unichar_t) * (unilen + 1))) == NULL)
- return NULL;
- uni[unilen] = 0;
- } else if (maxchars == 0) {
- if ((uni = realloc(uni,
- sizeof(unichar_t) * (unilen + 1))) == NULL)
- return NULL;
- uni[unilen] = 0;
- } else if (unilen < maxchars)
- uni[unilen] = 0;
- unistr->str = uni;
- unistr->len = unilen;
- }
- }
- return unistr;
- }
- /** Encode Unicode string to UTF-8 string
- *
- * @param[out] utf8 string buffer, may be NULL.
- * @param[out] utf8lenp pointer to length of buffer, may be NULL.
- * @param[in] maxbytes maximum number of bytes to be encoded. 0 means infinite
- * @param[in] unistr source Unicode string, must not be NULL.
- *
- * @returns string buffer.
- * If utf8 was NULL or maxbytes was 0 (infinite), required buffer will be
- * (re-)allocated.
- * If error occurred, NULL is returned and errno is set.
- *
- * @note utf8 must not point to static memory.
- */
- char *sombok_encode_utf8(char *utf8, size_t *utf8lenp, size_t maxbytes,
- unistr_t *unistr)
- {
- size_t i, utf8len, unilen;
- unichar_t unichar;
- int pass;
- if (unistr == NULL) {
- errno = EINVAL;
- return NULL;
- }
- if (unistr->str == NULL)
- unilen = 0;
- else
- unilen = unistr->len;
- for (pass = 1; pass <= 2; pass++) {
- for (i = 0, utf8len = 0; i < unilen; i++) {
- unichar = unistr->str[i];
- if (unichar == (unichar & 0x007F)) {
- if (maxbytes != 0 && maxbytes < utf8len + 1)
- break;
- if (pass == 2)
- utf8[utf8len] = (char) unichar;
- utf8len++;
- } else if (unichar == (unichar & 0x07FF)) {
- if (maxbytes != 0 && maxbytes < utf8len + 2)
- break;
- if (pass == 2) {
- utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len] = (char) (unichar & 0x1F) | 0xC0;
- }
- utf8len += 2;
- } else if (unichar == (unichar & 0x00FFFF)) {
- if (maxbytes != 0 && maxbytes < utf8len + 3)
- break;
- if (pass == 2) {
- utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len] = (char) (unichar & 0x0F) | 0xE0;
- }
- utf8len += 3;
- } else if (unichar == (unichar & 0x001FFFFF)) {
- if (maxbytes != 0 && maxbytes < utf8len + 4)
- break;
- if (pass == 2) {
- utf8[utf8len + 3] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len] = (char) (unichar & 0x07) | 0xF0;
- }
- utf8len += 4;
- } else if (unichar == (unichar & 0x03FFFFFF)) {
- if (maxbytes != 0 && maxbytes < utf8len + 5)
- break;
- if (pass == 2) {
- utf8[utf8len + 4] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 3] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len] = (char) (unichar & 0x03) | 0xF8;
- }
- utf8len += 5;
- } else if (unichar == (unichar & 0x7FFFFFFF)) {
- if (maxbytes != 0 && maxbytes < utf8len + 6)
- break;
- if (pass == 2) {
- utf8[utf8len + 5] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 4] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 3] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 2] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len + 1] = (char) (unichar & 0x3F) | 0x80;
- unichar >>= 6;
- utf8[utf8len] = (char) (unichar & 0x01) | 0xFC;
- }
- utf8len += 6;
- } else {
- errno = EPERM;
- return NULL;
- }
- }
- if (pass == 1) {
- if (utf8 == NULL) {
- if ((utf8 = malloc(sizeof(char) * (utf8len + 1))) == NULL)
- return NULL;
- utf8[utf8len] = '\0';
- } else if (maxbytes == 0) {
- if ((utf8 = realloc(utf8,
- sizeof(char) * (utf8len + 1))) == NULL)
- return NULL;
- utf8[utf8len] = '\0';
- } else if (utf8len < maxbytes)
- utf8[utf8len] = '\0';
- if (utf8lenp != NULL)
- *utf8lenp = utf8len;
- }
- }
- return utf8;
- }