/js/lib/Socket.IO-node/support/expresso/deps/jscoverage/encoding.c
C | 315 lines | 259 code | 37 blank | 19 comment | 31 complexity | 7a327ada998ce4cafc9bf50ce0c0d1f2 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, MPL-2.0-no-copyleft-exception, BSD-3-Clause
1/* 2 encoding.c - character encoding 3 Copyright (C) 2008 siliconforks.com 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License along 16 with this program; if not, write to the Free Software Foundation, Inc., 17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18*/ 19 20#include <config.h> 21 22#include "encoding.h" 23 24#include <assert.h> 25#include <limits.h> 26#include <string.h> 27 28#ifdef HAVE_ICONV_H 29#include <iconv.h> 30#elif defined HAVE_WINDOWS_H 31#include <windows.h> 32#endif 33 34#include "util.h" 35 36static void skip_bom(jschar ** characters, size_t * num_characters) { 37 jschar * c = *characters; 38 size_t nc = *num_characters; 39 40 size_t i; 41 for (i = 0; i < nc; i++) { 42 if (c[i] != 0xfeff) { 43 break; 44 } 45 } 46 47 if (i == 0) { 48 return; 49 } 50 51 nc -= i; 52 jschar * old = c; 53 c = xnew(jschar, nc); 54 memcpy(c, old + i, nc * sizeof(jschar)); 55 free(old); 56 57 *characters = c; 58 *num_characters = nc; 59} 60 61#ifdef HAVE_ICONV 62 63#ifdef WORDS_BIGENDIAN 64#define UTF_16_INTERNAL "UTF-16BE" 65#else 66#define UTF_16_INTERNAL "UTF-16LE" 67#endif 68 69int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) { 70 assert(encoding != NULL); 71 72 iconv_t state = iconv_open(UTF_16_INTERNAL, encoding); 73 if (state == (iconv_t) -1) { 74 return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED; 75 } 76 77 ICONV_CONST char * input = (char *) bytes; 78 size_t input_bytes_left = num_bytes; 79 80 jschar * c = xnew(jschar, num_bytes); 81 char * output = (char *) c; 82 size_t output_bytes_left = sizeof(jschar) * num_bytes; 83 84 size_t result = iconv(state, &input, &input_bytes_left, &output, &output_bytes_left); 85 iconv_close(state); 86 if (result == (size_t) -1) { 87 free(c); 88 return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE; 89 } 90 91 assert(input_bytes_left == 0); 92 93 size_t nc = ((jschar *) output) - c; 94 95 skip_bom(&c, &nc); 96 97 *characters = c; 98 *num_characters = nc; 99 return 0; 100} 101 102#elif HAVE_MULTIBYTETOWIDECHAR 103 104/* http://msdn.microsoft.com/en-us/library/ms776446(VS.85).aspx */ 105static struct CodePage { 106 UINT value; 107 LPCSTR string; 108} code_pages[] = { 109 {37, "IBM037"}, /* IBM EBCDIC US-Canada */ 110 {437, "IBM437"}, /* OEM United States */ 111 {500, "IBM500"}, /* IBM EBCDIC International */ 112 {708, "ASMO-708"}, /* Arabic (ASMO 708) */ 113 {720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */ 114 {737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */ 115 {775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */ 116 {850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ 117 {852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */ 118 {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ 119 {857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */ 120 {858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */ 121 {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ 122 {861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */ 123 {862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */ 124 {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ 125 {864, "IBM864"}, /* OEM Arabic; Arabic (864) */ 126 {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ 127 {866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */ 128 {869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ 129 {870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ 130 {874, "windows-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ 131 {875, "cp875"}, /* IBM EBCDIC Greek Modern */ 132 {932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ 133 {936, "gb2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ 134 {949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */ 135 {950, "big5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ 136 {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */ 137 {1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */ 138 {1140, "IBM01140"}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ 139 {1141, "IBM01141"}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ 140 {1142, "IBM01142"}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ 141 {1143, "IBM01143"}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ 142 {1144, "IBM01144"}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ 143 {1145, "IBM01145"}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ 144 {1146, "IBM01146"}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ 145 {1147, "IBM01147"}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ 146 {1148, "IBM01148"}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ 147 {1149, "IBM01149"}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ 148 {1200, "utf-16"}, /* Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications */ 149 {1201, "unicodeFFFE"}, /* Unicode UTF-16, big endian byte order; available only to managed applications */ 150 {1250, "windows-1250"}, /* ANSI Central European; Central European (Windows) */ 151 {1251, "windows-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */ 152 {1252, "windows-1252"}, /* ANSI Latin 1; Western European (Windows) */ 153 {1253, "windows-1253"}, /* ANSI Greek; Greek (Windows) */ 154 {1254, "windows-1254"}, /* ANSI Turkish; Turkish (Windows) */ 155 {1255, "windows-1255"}, /* ANSI Hebrew; Hebrew (Windows) */ 156 {1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */ 157 {1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */ 158 {1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ 159 {1361, "Johab"}, /* Korean (Johab) */ 160 {10000, "macintosh"}, /* MAC Roman; Western European (Mac) */ 161 {10001, "x-mac-japanese"}, /* Japanese (Mac) */ 162 {10002, "x-mac-chinesetrad"}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ 163 {10003, "x-mac-korean"}, /* Korean (Mac) */ 164 {10004, "x-mac-arabic"}, /* Arabic (Mac) */ 165 {10005, "x-mac-hebrew"}, /* Hebrew (Mac) */ 166 {10006, "x-mac-greek"}, /* Greek (Mac) */ 167 {10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */ 168 {10008, "x-mac-chinesesimp"}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ 169 {10010, "x-mac-romanian"}, /* Romanian (Mac) */ 170 {10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */ 171 {10021, "x-mac-thai"}, /* Thai (Mac) */ 172 {10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */ 173 {10079, "x-mac-icelandic"}, /* Icelandic (Mac) */ 174 {10081, "x-mac-turkish"}, /* Turkish (Mac) */ 175 {10082, "x-mac-croatian"}, /* Croatian (Mac) */ 176 {12000, "utf-32"}, /* Unicode UTF-32, little endian byte order; available only to managed applications */ 177 {12001, "utf-32BE"}, /* Unicode UTF-32, big endian byte order; available only to managed applications */ 178 {20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */ 179 {20001, "x-cp20001"}, /* TCA Taiwan */ 180 {20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */ 181 {20003, "x-cp20003"}, /* IBM5550 Taiwan */ 182 {20004, "x-cp20004"}, /* TeleText Taiwan */ 183 {20005, "x-cp20005"}, /* Wang Taiwan */ 184 {20105, "x-IA5"}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ 185 {20106, "x-IA5-German"}, /* IA5 German (7-bit) */ 186 {20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */ 187 {20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */ 188 {20127, "us-ascii"}, /* US-ASCII (7-bit) */ 189 {20261, "x-cp20261"}, /* T.61 */ 190 {20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */ 191 {20273, "IBM273"}, /* IBM EBCDIC Germany */ 192 {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ 193 {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ 194 {20280, "IBM280"}, /* IBM EBCDIC Italy */ 195 {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ 196 {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ 197 {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ 198 {20297, "IBM297"}, /* IBM EBCDIC France */ 199 {20420, "IBM420"}, /* IBM EBCDIC Arabic */ 200 {20423, "IBM423"}, /* IBM EBCDIC Greek */ 201 {20424, "IBM424"}, /* IBM EBCDIC Hebrew */ 202 {20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */ 203 {20838, "IBM-Thai"}, /* IBM EBCDIC Thai */ 204 {20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ 205 {20871, "IBM871"}, /* IBM EBCDIC Icelandic */ 206 {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ 207 {20905, "IBM905"}, /* IBM EBCDIC Turkish */ 208 {20924, "IBM00924"}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ 209 {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */ 210 {20936, "x-cp20936"}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ 211 {20949, "x-cp20949"}, /* Korean Wansung */ 212 {21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ 213 {21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ 214 {28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ 215 {28592, "iso-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */ 216 {28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */ 217 {28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */ 218 {28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */ 219 {28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */ 220 {28597, "iso-8859-7"}, /* ISO 8859-7 Greek */ 221 {28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ 222 {28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */ 223 {28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */ 224 {28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */ 225 {29001, "x-Europa"}, /* Europa 3 */ 226 {38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ 227 {50220, "iso-2022-jp"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */ 228 {50221, "csISO2022JP"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */ 229 {50222, "iso-2022-jp"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */ 230 {50225, "iso-2022-kr"}, /* ISO 2022 Korean */ 231 {50227, "x-cp50227"}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ 232 {51932, "euc-jp"}, /* EUC Japanese */ 233 {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ 234 {51949, "euc-kr"}, /* EUC Korean */ 235 {52936, "hz-gb-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ 236 {54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ 237 {57002, "x-iscii-de"}, /* ISCII Devanagari */ 238 {57003, "x-iscii-be"}, /* ISCII Bengali */ 239 {57004, "x-iscii-ta"}, /* ISCII Tamil */ 240 {57005, "x-iscii-te"}, /* ISCII Telugu */ 241 {57006, "x-iscii-as"}, /* ISCII Assamese */ 242 {57007, "x-iscii-or"}, /* ISCII Oriya */ 243 {57008, "x-iscii-ka"}, /* ISCII Kannada */ 244 {57009, "x-iscii-ma"}, /* ISCII Malayalam */ 245 {57010, "x-iscii-gu"}, /* ISCII Gujarati */ 246 {57011, "x-iscii-pa"}, /* ISCII Punjabi */ 247 {65000, "utf-7"}, /* Unicode (UTF-7) */ 248 {65001, "utf-8"}, /* Unicode (UTF-8) */ 249}; 250 251int find_code_page(const char * encoding, UINT * code_page) { 252 for (size_t i = 0; i < sizeof(code_pages) / sizeof(code_pages[0]); i++) { 253 if (strcasecmp(encoding, code_pages[i].string) == 0) { 254 *code_page = code_pages[i].value; 255 return 0; 256 } 257 } 258 return -1; 259} 260 261int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) { 262 assert(encoding != NULL); 263 264 if (num_bytes == 0) { 265 *characters = xnew(jschar, 0); 266 *num_characters = 0; 267 return 0; 268 } 269 270 UINT code_page; 271 if (find_code_page(encoding, &code_page) != 0) { 272 return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED; 273 } 274 275 if (num_bytes > INT_MAX) { 276 fatal("overflow"); 277 } 278 279 *characters = xnew(jschar, num_bytes); 280 281 int result = MultiByteToWideChar(code_page, MB_ERR_INVALID_CHARS, bytes, num_bytes, *characters, num_bytes); 282 if (result == 0) { 283 free(*characters); 284 return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE; 285 } 286 287 *num_characters = result; 288 skip_bom(characters, num_characters); 289 return 0; 290} 291 292#else 293 294int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) { 295 assert(encoding != NULL); 296 297 if (strcasecmp(encoding, "us-ascii") != 0 && strcasecmp(encoding, "iso-8859-1") != 0 && strcasecmp(encoding, "utf-8") != 0) { 298 return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED; 299 } 300 301 jschar * c = xnew(jschar, num_bytes); 302 for (size_t i = 0; i < num_bytes; i++) { 303 if (bytes[i] > 127) { 304 free(c); 305 return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED; 306 } 307 c[i] = bytes[i]; 308 } 309 310 *characters = c; 311 *num_characters = num_bytes; 312 return 0; 313} 314 315#endif