PageRenderTime 60ms CodeModel.GetById 30ms app.highlight 25ms RepoModel.GetById 1ms app.codeStats 0ms

/js/lib/Socket.IO-node/support/expresso/deps/jscoverage/encoding.c

http://github.com/onedayitwillmake/RealtimeMultiplayerNodeJs
C | 315 lines | 259 code | 37 blank | 19 comment | 31 complexity | 7a327ada998ce4cafc9bf50ce0c0d1f2 MD5 | raw file
  1/*
  2    encoding.c - character encoding
  3    Copyright (C) 2008 siliconforks.com
  4
  5    This program is free software; you can redistribute it and/or modify
  6    it under the terms of the GNU General Public License as published by
  7    the Free Software Foundation; either version 2 of the License, or
  8    (at your option) any later version.
  9
 10    This program is distributed in the hope that it will be useful,
 11    but WITHOUT ANY WARRANTY; without even the implied warranty of
 12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13    GNU General Public License for more details.
 14
 15    You should have received a copy of the GNU General Public License along
 16    with this program; if not, write to the Free Software Foundation, Inc.,
 17    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 18*/
 19
 20#include <config.h>
 21
 22#include "encoding.h"
 23
 24#include <assert.h>
 25#include <limits.h>
 26#include <string.h>
 27
 28#ifdef HAVE_ICONV_H
 29#include <iconv.h>
 30#elif defined HAVE_WINDOWS_H
 31#include <windows.h>
 32#endif
 33
 34#include "util.h"
 35
 36static void skip_bom(jschar ** characters, size_t * num_characters) {
 37  jschar * c = *characters;
 38  size_t nc = *num_characters;
 39
 40  size_t i;
 41  for (i = 0; i < nc; i++) {
 42    if (c[i] != 0xfeff) {
 43      break;
 44    }
 45  }
 46
 47  if (i == 0) {
 48    return;
 49  }
 50
 51  nc -= i;
 52  jschar * old = c;
 53  c = xnew(jschar, nc);
 54  memcpy(c, old + i, nc * sizeof(jschar));
 55  free(old);
 56
 57  *characters = c;
 58  *num_characters = nc;
 59}
 60
 61#ifdef HAVE_ICONV
 62
 63#ifdef WORDS_BIGENDIAN
 64#define UTF_16_INTERNAL "UTF-16BE"
 65#else
 66#define UTF_16_INTERNAL "UTF-16LE"
 67#endif
 68
 69int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
 70  assert(encoding != NULL);
 71
 72  iconv_t state = iconv_open(UTF_16_INTERNAL, encoding);
 73  if (state == (iconv_t) -1) {
 74    return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
 75  }
 76
 77  ICONV_CONST char * input = (char *) bytes;
 78  size_t input_bytes_left = num_bytes;
 79
 80  jschar * c = xnew(jschar, num_bytes);
 81  char * output = (char *) c;
 82  size_t output_bytes_left = sizeof(jschar) * num_bytes;
 83
 84  size_t result = iconv(state, &input, &input_bytes_left, &output, &output_bytes_left);
 85  iconv_close(state);
 86  if (result == (size_t) -1) {
 87    free(c);
 88    return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE;
 89  }
 90
 91  assert(input_bytes_left == 0);
 92
 93  size_t nc = ((jschar *) output) - c;
 94
 95  skip_bom(&c, &nc);
 96
 97  *characters = c;
 98  *num_characters = nc;
 99  return 0;
100}
101
102#elif HAVE_MULTIBYTETOWIDECHAR
103
104/* http://msdn.microsoft.com/en-us/library/ms776446(VS.85).aspx */
105static struct CodePage {
106  UINT value;
107  LPCSTR string;
108} code_pages[] = {
109  {37,		"IBM037"},			/* IBM EBCDIC US-Canada */
110  {437,		"IBM437"},			/* OEM United States */
111  {500,		"IBM500"},			/* IBM EBCDIC International */
112  {708,		"ASMO-708"},			/* Arabic (ASMO 708) */
113  {720,		"DOS-720"},			/* Arabic (Transparent ASMO); Arabic (DOS) */
114  {737,		"ibm737"},			/* OEM Greek (formerly 437G); Greek (DOS) */
115  {775,		"ibm775"},			/* OEM Baltic; Baltic (DOS) */
116  {850,		"ibm850"},			/* OEM Multilingual Latin 1; Western European (DOS) */
117  {852,		"ibm852"},			/* OEM Latin 2; Central European (DOS) */
118  {855,		"IBM855"},			/* OEM Cyrillic (primarily Russian) */
119  {857,		"ibm857"},			/* OEM Turkish; Turkish (DOS) */
120  {858,		"IBM00858"},			/* OEM Multilingual Latin 1 + Euro symbol */
121  {860,		"IBM860"},			/* OEM Portuguese; Portuguese (DOS) */
122  {861,		"ibm861"},			/* OEM Icelandic; Icelandic (DOS) */
123  {862,		"DOS-862"},			/* OEM Hebrew; Hebrew (DOS) */
124  {863,		"IBM863"},			/* OEM French Canadian; French Canadian (DOS) */
125  {864,		"IBM864"},			/* OEM Arabic; Arabic (864) */
126  {865,		"IBM865"},			/* OEM Nordic; Nordic (DOS) */
127  {866,		"cp866"},			/* OEM Russian; Cyrillic (DOS) */
128  {869,		"ibm869"},			/* OEM Modern Greek; Greek, Modern (DOS) */
129  {870,		"IBM870"},			/* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */
130  {874,		"windows-874"},			/* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */
131  {875,		"cp875"},			/* IBM EBCDIC Greek Modern */
132  {932,		"shift_jis"},			/* ANSI/OEM Japanese; Japanese (Shift-JIS) */
133  {936,		"gb2312"},			/* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */
134  {949,		"ks_c_5601-1987"},		/* ANSI/OEM Korean (Unified Hangul Code) */
135  {950,		"big5"},			/* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */
136  {1026,	"IBM1026"},			/* IBM EBCDIC Turkish (Latin 5) */
137  {1047,	"IBM01047"},			/* IBM EBCDIC Latin 1/Open System */
138  {1140,	"IBM01140"},			/* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */
139  {1141,	"IBM01141"},			/* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */
140  {1142,	"IBM01142"},			/* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */
141  {1143,	"IBM01143"},			/* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */
142  {1144,	"IBM01144"},			/* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */
143  {1145,	"IBM01145"},			/* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */
144  {1146,	"IBM01146"},			/* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */
145  {1147,	"IBM01147"},			/* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */
146  {1148,	"IBM01148"},			/* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */
147  {1149,	"IBM01149"},			/* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */
148  {1200,	"utf-16"},			/* Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications */
149  {1201,	"unicodeFFFE"},			/* Unicode UTF-16, big endian byte order; available only to managed applications */
150  {1250,	"windows-1250"},		/* ANSI Central European; Central European (Windows) */
151  {1251,	"windows-1251"},		/* ANSI Cyrillic; Cyrillic (Windows) */
152  {1252,	"windows-1252"},		/* ANSI Latin 1; Western European (Windows) */
153  {1253,	"windows-1253"},		/* ANSI Greek; Greek (Windows) */
154  {1254,	"windows-1254"},		/* ANSI Turkish; Turkish (Windows) */
155  {1255,	"windows-1255"},		/* ANSI Hebrew; Hebrew (Windows) */
156  {1256,	"windows-1256"},		/* ANSI Arabic; Arabic (Windows) */
157  {1257,	"windows-1257"},		/* ANSI Baltic; Baltic (Windows) */
158  {1258,	"windows-1258"},		/* ANSI/OEM Vietnamese; Vietnamese (Windows) */
159  {1361,	"Johab"},			/* Korean (Johab) */
160  {10000,	"macintosh"},			/* MAC Roman; Western European (Mac) */
161  {10001,	"x-mac-japanese"},		/* Japanese (Mac) */
162  {10002,	"x-mac-chinesetrad"},		/* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */
163  {10003,	"x-mac-korean"},		/* Korean (Mac) */
164  {10004,	"x-mac-arabic"},		/* Arabic (Mac) */
165  {10005,	"x-mac-hebrew"},		/* Hebrew (Mac) */
166  {10006,	"x-mac-greek"},			/* Greek (Mac) */
167  {10007,	"x-mac-cyrillic"},		/* Cyrillic (Mac) */
168  {10008,	"x-mac-chinesesimp"},		/* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */
169  {10010,	"x-mac-romanian"},		/* Romanian (Mac) */
170  {10017,	"x-mac-ukrainian"},		/* Ukrainian (Mac) */
171  {10021,	"x-mac-thai"},			/* Thai (Mac) */
172  {10029,	"x-mac-ce"},			/* MAC Latin 2; Central European (Mac) */
173  {10079,	"x-mac-icelandic"},		/* Icelandic (Mac) */
174  {10081,	"x-mac-turkish"},		/* Turkish (Mac) */
175  {10082,	"x-mac-croatian"},		/* Croatian (Mac) */
176  {12000,	"utf-32"},			/* Unicode UTF-32, little endian byte order; available only to managed applications */
177  {12001,	"utf-32BE"},			/* Unicode UTF-32, big endian byte order; available only to managed applications */
178  {20000,	"x-Chinese_CNS"},		/* CNS Taiwan; Chinese Traditional (CNS) */
179  {20001,	"x-cp20001"},			/* TCA Taiwan */
180  {20002,	"x_Chinese-Eten"},		/* Eten Taiwan; Chinese Traditional (Eten) */
181  {20003,	"x-cp20003"},			/* IBM5550 Taiwan */
182  {20004,	"x-cp20004"},			/* TeleText Taiwan */
183  {20005,	"x-cp20005"},			/* Wang Taiwan */
184  {20105,	"x-IA5"},			/* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */
185  {20106,	"x-IA5-German"},		/* IA5 German (7-bit) */
186  {20107,	"x-IA5-Swedish"},		/* IA5 Swedish (7-bit) */
187  {20108,	"x-IA5-Norwegian"},		/* IA5 Norwegian (7-bit) */
188  {20127,	"us-ascii"},			/* US-ASCII (7-bit) */
189  {20261,	"x-cp20261"},			/* T.61 */
190  {20269,	"x-cp20269"},			/* ISO 6937 Non-Spacing Accent */
191  {20273,	"IBM273"},			/* IBM EBCDIC Germany */
192  {20277,	"IBM277"},			/* IBM EBCDIC Denmark-Norway */
193  {20278,	"IBM278"},			/* IBM EBCDIC Finland-Sweden */
194  {20280,	"IBM280"},			/* IBM EBCDIC Italy */
195  {20284,	"IBM284"},			/* IBM EBCDIC Latin America-Spain */
196  {20285,	"IBM285"},			/* IBM EBCDIC United Kingdom */
197  {20290,	"IBM290"},			/* IBM EBCDIC Japanese Katakana Extended */
198  {20297,	"IBM297"},			/* IBM EBCDIC France */
199  {20420,	"IBM420"},			/* IBM EBCDIC Arabic */
200  {20423,	"IBM423"},			/* IBM EBCDIC Greek */
201  {20424,	"IBM424"},			/* IBM EBCDIC Hebrew */
202  {20833,	"x-EBCDIC-KoreanExtended"},	/* IBM EBCDIC Korean Extended */
203  {20838,	"IBM-Thai"},			/* IBM EBCDIC Thai */
204  {20866,	"koi8-r"},			/* Russian (KOI8-R); Cyrillic (KOI8-R) */
205  {20871,	"IBM871"},			/* IBM EBCDIC Icelandic */
206  {20880,	"IBM880"},			/* IBM EBCDIC Cyrillic Russian */
207  {20905,	"IBM905"},			/* IBM EBCDIC Turkish */
208  {20924,	"IBM00924"},			/* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */
209  {20932,	"EUC-JP"},			/* Japanese (JIS 0208-1990 and 0121-1990) */
210  {20936,	"x-cp20936"},			/* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */
211  {20949,	"x-cp20949"},			/* Korean Wansung */
212  {21025,	"cp1025"},			/* IBM EBCDIC Cyrillic Serbian-Bulgarian */
213  {21866,	"koi8-u"},			/* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */
214  {28591,	"iso-8859-1"},			/* ISO 8859-1 Latin 1; Western European (ISO) */
215  {28592,	"iso-8859-2"},			/* ISO 8859-2 Central European; Central European (ISO) */
216  {28593,	"iso-8859-3"},			/* ISO 8859-3 Latin 3 */
217  {28594,	"iso-8859-4"},			/* ISO 8859-4 Baltic */
218  {28595,	"iso-8859-5"},			/* ISO 8859-5 Cyrillic */
219  {28596,	"iso-8859-6"},			/* ISO 8859-6 Arabic */
220  {28597,	"iso-8859-7"},			/* ISO 8859-7 Greek */
221  {28598,	"iso-8859-8"},			/* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
222  {28599,	"iso-8859-9"},			/* ISO 8859-9 Turkish */
223  {28603,	"iso-8859-13"},			/* ISO 8859-13 Estonian */
224  {28605,	"iso-8859-15"},			/* ISO 8859-15 Latin 9 */
225  {29001,	"x-Europa"},			/* Europa 3 */
226  {38598,	"iso-8859-8-i"},		/* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
227  {50220,	"iso-2022-jp"},			/* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */
228  {50221,	"csISO2022JP"},			/* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */
229  {50222,	"iso-2022-jp"},			/* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */
230  {50225,	"iso-2022-kr"},			/* ISO 2022 Korean */
231  {50227,	"x-cp50227"},			/* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */
232  {51932,	"euc-jp"},			/* EUC Japanese */
233  {51936,	"EUC-CN"},			/* EUC Simplified Chinese; Chinese Simplified (EUC) */
234  {51949,	"euc-kr"},			/* EUC Korean */
235  {52936,	"hz-gb-2312"},			/* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */
236  {54936,	"GB18030"},			/* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */
237  {57002,	"x-iscii-de"},			/* ISCII Devanagari */
238  {57003,	"x-iscii-be"},			/* ISCII Bengali */
239  {57004,	"x-iscii-ta"},			/* ISCII Tamil */
240  {57005,	"x-iscii-te"},			/* ISCII Telugu */
241  {57006,	"x-iscii-as"},			/* ISCII Assamese */
242  {57007,	"x-iscii-or"},			/* ISCII Oriya */
243  {57008,	"x-iscii-ka"},			/* ISCII Kannada */
244  {57009,	"x-iscii-ma"},			/* ISCII Malayalam */
245  {57010,	"x-iscii-gu"},			/* ISCII Gujarati */
246  {57011,	"x-iscii-pa"},			/* ISCII Punjabi */
247  {65000,	"utf-7"},			/* Unicode (UTF-7) */
248  {65001,	"utf-8"},			/* Unicode (UTF-8) */
249};
250
251int find_code_page(const char * encoding, UINT * code_page) {
252  for (size_t i = 0; i < sizeof(code_pages) / sizeof(code_pages[0]); i++) {
253    if (strcasecmp(encoding, code_pages[i].string) == 0) {
254      *code_page = code_pages[i].value;
255      return 0;
256    }
257  }
258  return -1;
259}
260
261int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
262  assert(encoding != NULL);
263
264  if (num_bytes == 0) {
265    *characters = xnew(jschar, 0);
266    *num_characters = 0;
267    return 0;
268  }
269
270  UINT code_page;
271  if (find_code_page(encoding, &code_page) != 0) {
272    return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
273  }
274
275  if (num_bytes > INT_MAX) {
276    fatal("overflow");
277  }
278
279  *characters = xnew(jschar, num_bytes);
280
281  int result = MultiByteToWideChar(code_page, MB_ERR_INVALID_CHARS, bytes, num_bytes, *characters, num_bytes);
282  if (result == 0) {
283    free(*characters);
284    return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE;
285  }
286
287  *num_characters = result;
288  skip_bom(characters, num_characters);
289  return 0;
290}
291
292#else
293
294int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
295  assert(encoding != NULL);
296
297  if (strcasecmp(encoding, "us-ascii") != 0 && strcasecmp(encoding, "iso-8859-1") != 0 && strcasecmp(encoding, "utf-8") != 0) {
298    return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
299  }
300
301  jschar * c = xnew(jschar, num_bytes);
302  for (size_t i = 0; i < num_bytes; i++) {
303    if (bytes[i] > 127) {
304      free(c);
305      return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
306    }
307    c[i] = bytes[i];
308  }
309
310  *characters = c;
311  *num_characters = num_bytes;
312  return 0;
313}
314
315#endif