PageRenderTime 243ms CodeModel.GetById 140ms app.highlight 13ms RepoModel.GetById 88ms app.codeStats 0ms

/jansson/src/utf.c

http://github.com/nicolasff/webdis
C | 190 lines | 143 code | 29 blank | 18 comment | 66 complexity | ea60c1c08659281709509bbaee02fd38 MD5 | raw file
  1/*
  2 * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
  3 *
  4 * Jansson is free software; you can redistribute it and/or modify
  5 * it under the terms of the MIT license. See LICENSE for details.
  6 */
  7
  8#include <string.h>
  9#include "utf.h"
 10
 11int utf8_encode(int32_t codepoint, char *buffer, int *size)
 12{
 13    if(codepoint < 0)
 14        return -1;
 15    else if(codepoint < 0x80)
 16    {
 17        buffer[0] = (char)codepoint;
 18        *size = 1;
 19    }
 20    else if(codepoint < 0x800)
 21    {
 22        buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
 23        buffer[1] = 0x80 + ((codepoint & 0x03F));
 24        *size = 2;
 25    }
 26    else if(codepoint < 0x10000)
 27    {
 28        buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
 29        buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
 30        buffer[2] = 0x80 + ((codepoint & 0x003F));
 31        *size = 3;
 32    }
 33    else if(codepoint <= 0x10FFFF)
 34    {
 35        buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
 36        buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
 37        buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
 38        buffer[3] = 0x80 + ((codepoint & 0x00003F));
 39        *size = 4;
 40    }
 41    else
 42        return -1;
 43
 44    return 0;
 45}
 46
 47int utf8_check_first(char byte)
 48{
 49    unsigned char u = (unsigned char)byte;
 50
 51    if(u < 0x80)
 52        return 1;
 53
 54    if(0x80 <= u && u <= 0xBF) {
 55        /* second, third or fourth byte of a multi-byte
 56           sequence, i.e. a "continuation byte" */
 57        return 0;
 58    }
 59    else if(u == 0xC0 || u == 0xC1) {
 60        /* overlong encoding of an ASCII byte */
 61        return 0;
 62    }
 63    else if(0xC2 <= u && u <= 0xDF) {
 64        /* 2-byte sequence */
 65        return 2;
 66    }
 67
 68    else if(0xE0 <= u && u <= 0xEF) {
 69        /* 3-byte sequence */
 70        return 3;
 71    }
 72    else if(0xF0 <= u && u <= 0xF4) {
 73        /* 4-byte sequence */
 74        return 4;
 75    }
 76    else { /* u >= 0xF5 */
 77        /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
 78           UTF-8 */
 79        return 0;
 80    }
 81}
 82
 83int utf8_check_full(const char *buffer, int size, int32_t *codepoint)
 84{
 85    int i;
 86    int32_t value = 0;
 87    unsigned char u = (unsigned char)buffer[0];
 88
 89    if(size == 2)
 90    {
 91        value = u & 0x1F;
 92    }
 93    else if(size == 3)
 94    {
 95        value = u & 0xF;
 96    }
 97    else if(size == 4)
 98    {
 99        value = u & 0x7;
100    }
101    else
102        return 0;
103
104    for(i = 1; i < size; i++)
105    {
106        u = (unsigned char)buffer[i];
107
108        if(u < 0x80 || u > 0xBF) {
109            /* not a continuation byte */
110            return 0;
111        }
112
113        value = (value << 6) + (u & 0x3F);
114    }
115
116    if(value > 0x10FFFF) {
117        /* not in Unicode range */
118        return 0;
119    }
120
121    else if(0xD800 <= value && value <= 0xDFFF) {
122        /* invalid code point (UTF-16 surrogate halves) */
123        return 0;
124    }
125
126    else if((size == 2 && value < 0x80) ||
127            (size == 3 && value < 0x800) ||
128            (size == 4 && value < 0x10000)) {
129        /* overlong encoding */
130        return 0;
131    }
132
133    if(codepoint)
134        *codepoint = value;
135
136    return 1;
137}
138
/*
 * Decode the UTF-8 sequence at the start of a NUL-terminated string
 * and step past it.
 *
 *   buffer     current position in the string
 *   codepoint  if non-NULL, receives the decoded code point
 *
 * Returns a pointer to the byte following the decoded sequence, or
 * NULL if the bytes at buffer are not valid UTF-8. When buffer already
 * points at the terminating NUL, buffer itself is returned and
 * *codepoint is left unmodified.
 */
const char *utf8_iterate(const char *buffer, int32_t *codepoint)
{
    int32_t decoded;
    int length;

    /* end of string: nothing to decode */
    if(buffer[0] == '\0')
        return buffer;

    length = utf8_check_first(buffer[0]);
    if(length <= 0)
        return NULL;

    if(length > 1) {
        /* multi-byte sequence: validate and decode all of its bytes */
        if(!utf8_check_full(buffer, length, &decoded))
            return NULL;
    }
    else {
        /* single byte: the byte value is the code point */
        decoded = (unsigned char)buffer[0];
    }

    if(codepoint)
        *codepoint = decoded;

    return buffer + length;
}
164
/*
 * Check whether a byte string is valid UTF-8.
 *
 *   string  the bytes to check
 *   length  number of bytes to check, or -1 to use strlen(string)
 *           (string must then be NUL-terminated)
 *
 * Returns 1 if every sequence in the first `length` bytes is valid
 * UTF-8 and no sequence is cut off by the end of the range, 0
 * otherwise. A length of 0, or a negative length other than -1, checks
 * nothing and returns 1.
 */
int utf8_check_string(const char *string, int length)
{
    int i;

    if(length == -1)
        length = (int)strlen(string);

    for(i = 0; i < length; i++)
    {
        int count = utf8_check_first(string[i]);
        if(count == 0)
            return 0;

        if(count > 1)
        {
            /* The sequence must fit in the remaining bytes. Compare
               against `length - i` rather than computing `i + count`:
               the subtraction cannot overflow because 0 <= i < length,
               whereas the addition is undefined behaviour when length
               is close to INT_MAX. */
            if(count > length - i)
                return 0;

            if(!utf8_check_full(&string[i], count, NULL))
                return 0;

            /* skip the trailing bytes; the loop's i++ consumes the
               first byte of the sequence */
            i += count - 1;
        }
    }

    return 1;
}