/libwc/detect.c
C | 544 lines | 529 code | 11 blank | 4 comment | 177 complexity | 6eb668be5265700f20ebcb2bfc3ded1e MD5 | raw file
1
2#include "wc.h"
3#include "iso2022.h"
4#include "sjis.h"
5#include "big5.h"
6#include "hz.h"
7#include "viet.h"
8#ifdef USE_UNICODE
9#include "utf8.h"
10#include "utf7.h"
11#endif
12
13wc_uint8 WC_DETECT_MAP[ 0x100 ] = {
14 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
15 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
17 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
20 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
21 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
26 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
27 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
28 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
29 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
30};
31
/*
 * Status of one candidate encoding while scanning.  The values are bit
 * flags so that POSSIBLE, OK and BROKEN can be combined in a single int;
 * the code treats a status exactly equal to DETECT_ERROR as "ruled out".
 */
enum {
    DETECT_NORMAL   = 0,
    DETECT_POSSIBLE = 1,
    DETECT_OK       = 2,
    DETECT_BROKEN   = 4,
    DETECT_ERROR    = 8
};

/* OR the flag(s) y into the status lvalue x. */
#define SET_DETECT(x, y) ((x) |= (y))

/*
 * Record a malformed sequence in the status lvalue x: the first one only
 * adds DETECT_BROKEN, a second one replaces the status with DETECT_ERROR.
 * Note that x is evaluated more than once.
 */
#define SET_BROKEN_ERROR(x) \
    ((x) = (((x) & DETECT_BROKEN) ? DETECT_ERROR : ((x) | DETECT_BROKEN)))
39
40void
41wc_create_detect_map(wc_ces ces, wc_bool esc)
42{
43 static wc_ces detect_ces = WC_CES_US_ASCII;
44 int i;
45
46 if (ces != detect_ces) {
47 if (ces & WC_CES_T_VIET) {
48 wc_uint8 *map = NULL;
49 switch (ces) {
50 case WC_CES_TCVN_5712:
51 map = wc_c0_tcvn57122_map;
52 break;
53 case WC_CES_VISCII_11:
54 map = wc_c0_viscii112_map;
55 break;
56 case WC_CES_VPS:
57 map = wc_c0_vps2_map;
58 break;
59 }
60 for (i = 0; i < 0x20; i++)
61 WC_DETECT_MAP[i] = map[i] ? 1 : 0;
62 } else {
63 for (i = 0; i < 0x20; i++)
64 WC_DETECT_MAP[i] = 0;
65 WC_DETECT_MAP[WC_C_HZ_TILDA] = (ces == WC_CES_HZ_GB_2312) ? 1 : 0;
66#ifdef USE_UNICODE
67 WC_DETECT_MAP[WC_C_UTF7_PLUS] = (ces == WC_CES_UTF_7) ? 1 : 0;
68#endif
69 }
70 detect_ces = ces;
71 }
72 WC_DETECT_MAP[WC_C_ESC] = (esc || (ces & WC_CES_T_ISO_2022)) ? 1 : 0;
73 return;
74}
75
76wc_ces
77wc_auto_detect(char *is, size_t len, wc_ces hint)
78{
79 wc_uchar *p = (wc_uchar *)is;
80 wc_uchar *ep = p + len;
81 wc_uchar *q;
82 wc_ces euc = 0, priv = 0;
83 wc_status st;
84 int euc_state = 0, sjis_state = 0, big5_state = 0, hz_state = 0;
85 int iso_detect = DETECT_ERROR, euc_detect = DETECT_ERROR,
86 sjis_detect = DETECT_ERROR, big5_detect = DETECT_ERROR,
87 hz_detect = DETECT_ERROR, latin_detect = DETECT_ERROR,
88 priv_detect = DETECT_ERROR;
89 int possible = 0;
90 wc_bool iso2022jp2 = WC_FALSE, iso2022jp3 = WC_FALSE,
91 iso2022cn = WC_FALSE, iso2022kr = WC_FALSE, ok = WC_FALSE;
92#ifdef USE_UNICODE
93 int utf8_state = 0;
94 int utf8_detect = DETECT_ERROR;
95 int utf8_next = 0;
96#endif
97
98 wc_create_detect_map(hint, WC_TRUE);
99 for (; p < ep && ! WC_DETECT_MAP[*p]; p++)
100 ;
101 if (p == ep)
102 return hint;
103
104 switch (hint) {
105 case WC_CES_ISO_2022_JP:
106 case WC_CES_ISO_2022_JP_2:
107 case WC_CES_ISO_2022_JP_3:
108 case WC_CES_EUC_JP:
109 case WC_CES_SHIFT_JIS:
110 case WC_CES_SHIFT_JISX0213:
111 euc = WC_CES_EUC_JP;
112 euc_state = WC_EUC_NOSTATE;
113 sjis_state = WC_SJIS_NOSTATE;
114 iso_detect = euc_detect = sjis_detect = DETECT_NORMAL;
115 possible = 3;
116 break;
117 case WC_CES_ISO_2022_CN:
118 case WC_CES_EUC_CN:
119 euc = WC_CES_EUC_CN;
120 euc_state = WC_EUC_NOSTATE;
121 big5_state = WC_BIG5_NOSTATE;
122 iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
123 possible = 3;
124 break;
125 case WC_CES_EUC_TW:
126 case WC_CES_BIG5:
127 euc = WC_CES_EUC_TW;
128 euc_state = WC_EUC_NOSTATE;
129 big5_state = WC_BIG5_NOSTATE;
130 iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
131 possible = 3;
132 break;
133 case WC_CES_HZ_GB_2312:
134 euc = WC_CES_EUC_CN;
135 euc_state = WC_EUC_NOSTATE;
136 hz_state = WC_HZ_NOSTATE;
137 iso_detect = euc_detect = big5_detect = hz_detect = DETECT_NORMAL;
138 possible = 4;
139 break;
140 case WC_CES_ISO_2022_KR:
141 case WC_CES_EUC_KR:
142 euc = WC_CES_EUC_KR;
143 euc_state = WC_EUC_NOSTATE;
144 iso_detect = euc_detect = DETECT_NORMAL;
145 possible = 3;
146 break;
147#ifdef USE_UNICODE
148 case WC_CES_UTF_8:
149 iso_detect = DETECT_NORMAL;
150 possible = 1;
151 break;
152#endif
153 case WC_CES_US_ASCII:
154 iso_detect = latin_detect = DETECT_NORMAL;
155 possible = 2;
156 break;
157 default:
158 if (hint & WC_CES_T_ISO_8859) {
159 iso_detect = latin_detect = DETECT_NORMAL;
160 possible = 2;
161 } else {
162 iso_detect = priv_detect = DETECT_NORMAL;
163 priv = hint; /* for TVCN, VISCII, VPS */
164 possible = 2;
165 }
166 break;
167 }
168#ifdef USE_UNICODE
169 if (priv_detect == DETECT_ERROR) {
170 utf8_detect = DETECT_NORMAL;
171 possible++;
172 }
173#endif
174
175 wc_input_init(WC_CES_US_ASCII, &st);
176
177 for (; p < ep; p++) {
178 if (possible == 0 || (possible == 1 && ok))
179 break;
180 if (iso_detect != DETECT_ERROR) {
181 switch (*p) {
182 case WC_C_ESC:
183 if (*(p+1) == WC_C_MBCS) {
184 q = p;
185 if (! wc_parse_iso2022_esc(&q, &st))
186 break;
187 if (st.design[0] == WC_CCS_JIS_C_6226 ||
188 st.design[0] == WC_CCS_JIS_X_0208)
189 ;
190 else if (st.design[0] == WC_CCS_JIS_X_0213_1 ||
191 st.design[0] == WC_CCS_JIS_X_0213_2)
192 iso2022jp3 = WC_TRUE;
193 else if (WC_CCS_TYPE(st.design[0]) == WC_CCS_A_CS94W)
194 iso2022jp2 = WC_TRUE;
195 if (st.design[1] == WC_CCS_KS_X_1001)
196 iso2022kr = WC_TRUE;
197 else if (st.design[1] == WC_CCS_GB_2312 ||
198 st.design[1] == WC_CCS_ISO_IR_165 ||
199 st.design[1] == WC_CCS_CNS_11643_1)
200 iso2022cn = WC_TRUE;
201 if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS94W ||
202 WC_CCS_TYPE(st.design[3]) == WC_CCS_A_CS94W)
203 iso2022cn = WC_TRUE;
204 } else if (*(p+1) == WC_C_G2_CS96) {
205 q = p;
206 if (! wc_parse_iso2022_esc(&q, &st))
207 break;
208 if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS96)
209 iso2022jp2 = WC_TRUE;
210 } else if (*(p+1) == WC_C_CSWSR) {
211 q = p;
212 if (! wc_parse_iso2022_esc(&q, &st))
213 break;
214 possible = 0;
215 iso_detect = DETECT_BROKEN;
216 continue;
217 }
218 iso_detect = DETECT_OK;
219 ok = WC_TRUE;
220 break;
221 case WC_C_SI:
222 case WC_C_SO:
223 iso_detect = DETECT_OK;
224 ok = WC_TRUE;
225 iso2022cn = WC_TRUE;
226 iso2022kr = WC_TRUE;
227 break;
228 default:
229 if (*p & 0x80) {
230 iso_detect = DETECT_ERROR;
231 possible--;
232 }
233 break;
234 }
235 }
236 if (euc_detect != DETECT_ERROR) {
237 switch (euc_state) {
238 case WC_EUC_NOSTATE:
239 switch (WC_ISO_MAP[*p]) {
240 case WC_ISO_MAP_GR:
241 euc_state = WC_EUC_MBYTE1;
242 break;
243 case WC_ISO_MAP_SS2:
244 if (euc == WC_CES_EUC_JP)
245 euc_state = WC_EUC_MBYTE1;
246 else if (euc == WC_CES_EUC_TW)
247 euc_state = WC_EUC_TW_SS2;
248 else
249 euc_detect = DETECT_ERROR;
250 break;
251 case WC_ISO_MAP_SS3:
252 if (euc == WC_CES_EUC_JP &&
253 WC_ISO_MAP[*(p+1)] == WC_ISO_MAP_GR)
254 ;
255 else
256 euc_detect = DETECT_ERROR;
257 break;
258 case WC_ISO_MAP_C1:
259 case WC_ISO_MAP_GR96:
260 euc_detect = DETECT_ERROR;
261 break;
262 }
263 break;
264 case WC_EUC_MBYTE1:
265 if (WC_ISO_MAP[*p] == WC_ISO_MAP_GR) {
266 SET_DETECT(euc_detect, DETECT_OK);
267 ok = WC_TRUE;
268 } else
269 SET_BROKEN_ERROR(euc_detect);
270 euc_state = WC_EUC_NOSTATE;
271 break;
272 case WC_EUC_TW_SS2:
273 if (!( 0xa0 <= *p && *p <= 0xb0) ||
274 WC_ISO_MAP[*(p+1)] != WC_ISO_MAP_GR)
275 euc_detect = DETECT_ERROR;
276 euc_state = WC_EUC_NOSTATE;
277 break;
278 }
279 if (euc_detect == DETECT_ERROR)
280 possible--;
281 }
282 if (sjis_detect != DETECT_ERROR) {
283 switch (sjis_state) {
284 case WC_SJIS_NOSTATE:
285 switch (WC_SJIS_MAP[*p]) {
286 case WC_SJIS_MAP_SL:
287 case WC_SJIS_MAP_SH:
288 sjis_state = WC_SJIS_SHIFT_L;
289 break;
290 case WC_SJIS_MAP_SK:
291 SET_DETECT(sjis_detect, DETECT_POSSIBLE);
292 break;
293 case WC_SJIS_MAP_SX:
294 if (WcOption.use_jisx0213) {
295 sjis_state = WC_SJIS_SHIFT_X;
296 break;
297 }
298 case WC_SJIS_MAP_80:
299 case WC_SJIS_MAP_A0:
300 case WC_SJIS_MAP_C1:
301 sjis_detect = DETECT_ERROR;
302 break;
303 }
304 break;
305 case WC_SJIS_SHIFT_L:
306 if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB) {
307 SET_DETECT(sjis_detect, DETECT_OK);
308 ok = WC_TRUE;
309 } else
310 SET_BROKEN_ERROR(sjis_detect);
311 sjis_state = WC_SJIS_NOSTATE;
312 break;
313 case WC_SJIS_SHIFT_X:
314 if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB)
315 SET_DETECT(sjis_detect, DETECT_POSSIBLE);
316 else
317 sjis_detect = DETECT_ERROR;
318 sjis_state = WC_SJIS_NOSTATE;
319 break;
320 }
321 if (sjis_detect == DETECT_ERROR)
322 possible--;
323 }
324 if (big5_detect != DETECT_ERROR) {
325 switch (big5_state) {
326 case WC_BIG5_NOSTATE:
327 switch (WC_BIG5_MAP[*p]) {
328 case WC_BIG5_MAP_UB:
329 big5_state = WC_BIG5_MBYTE1;
330 break;
331 case WC_BIG5_MAP_C1:
332 big5_detect = DETECT_ERROR;
333 break;
334 }
335 break;
336 case WC_BIG5_MBYTE1:
337 if (WC_BIG5_MAP[*p] & WC_BIG5_MAP_LB) {
338 SET_DETECT(big5_detect, DETECT_OK);
339 ok = WC_TRUE;
340 } else
341 SET_BROKEN_ERROR(big5_detect);
342 big5_state = WC_BIG5_NOSTATE;
343 break;
344 }
345 if (big5_detect == DETECT_ERROR)
346 possible--;
347 }
348 if (hz_detect != DETECT_ERROR) {
349 if (*p & 0x80) {
350 hz_detect = DETECT_ERROR;
351 possible--;
352 } else {
353 switch (hz_state) {
354 case WC_HZ_NOSTATE:
355 if (*p == WC_C_HZ_TILDA)
356 hz_state = WC_HZ_TILDA;
357 break;
358 case WC_HZ_TILDA:
359 if (*p == WC_C_HZ_SI)
360 hz_state = WC_HZ_MBYTE;
361 else
362 hz_state = WC_HZ_NOSTATE;
363 break;
364 case WC_HZ_TILDA_MB:
365 if (*p == WC_C_HZ_SO)
366 hz_state = WC_HZ_NOSTATE;
367 else
368 hz_state = WC_HZ_MBYTE;
369 break;
370 case WC_HZ_MBYTE:
371 if (*p == WC_C_HZ_TILDA)
372 hz_state = WC_HZ_TILDA_MB;
373 else
374 hz_state = WC_HZ_MBYTE1;
375 break;
376 case WC_HZ_MBYTE1:
377 hz_detect = DETECT_OK;
378 ok = WC_TRUE;
379 hz_state = WC_HZ_NOSTATE;
380 break;
381 }
382 }
383 }
384 if (latin_detect != DETECT_ERROR) {
385 switch (WC_ISO_MAP[*p] & WC_ISO_MAP_CG) {
386 case WC_ISO_MAP_GR:
387 case WC_ISO_MAP_GR96:
388 SET_DETECT(latin_detect, DETECT_OK);
389 ok = WC_TRUE;
390 break;
391 case WC_ISO_MAP_C1:
392 latin_detect = DETECT_ERROR;
393 break;
394 }
395 if (latin_detect == DETECT_ERROR)
396 possible--;
397 }
398 if (priv_detect != DETECT_ERROR) {
399 if (*p != WC_C_ESC && WC_DETECT_MAP[*p]) {
400 SET_DETECT(priv_detect, DETECT_OK);
401 ok = WC_TRUE;
402 }
403/*
404 if (priv_detect == DETECT_ERROR)
405 possible--;
406*/
407 }
408#ifdef USE_UNICODE
409 if (utf8_detect != DETECT_ERROR) {
410 switch (utf8_state) {
411 case WC_UTF8_NOSTATE:
412 switch (utf8_next = WC_UTF8_MAP[*p]) {
413 case 1:
414 case 8:
415 break;
416 case 0:
417 case 7:
418 utf8_detect = DETECT_ERROR;
419 break;
420 default:
421 utf8_next--;
422 utf8_state = WC_UTF8_NEXT;
423 break;
424 }
425 break;
426 case WC_UTF8_NEXT:
427 if (WC_UTF8_MAP[*p]) {
428 utf8_detect = DETECT_ERROR;
429 utf8_state = WC_UTF8_NOSTATE;
430 break;
431 }
432 utf8_next--;
433 if (! utf8_next) {
434 SET_DETECT(utf8_detect, DETECT_OK);
435 ok = WC_TRUE;
436 utf8_state = WC_UTF8_NOSTATE;
437 }
438 break;
439 }
440 if (utf8_detect == DETECT_ERROR)
441 possible--;
442 }
443#endif
444 }
445
446 if (iso_detect != DETECT_ERROR) {
447 if (iso_detect == DETECT_NORMAL) {
448 if (hz_detect == DETECT_OK)
449 return WC_CES_HZ_GB_2312;
450 if (priv_detect == DETECT_OK)
451 return priv;
452 return WC_CES_US_ASCII;
453 }
454 switch (euc) {
455 case WC_CES_EUC_CN:
456 case WC_CES_EUC_TW:
457 if (iso2022cn)
458 return WC_CES_ISO_2022_CN;
459 break;
460 case WC_CES_EUC_KR:
461 if (iso2022kr)
462 return WC_CES_ISO_2022_KR;
463 break;
464 }
465 if (iso2022jp3)
466 return WC_CES_ISO_2022_JP_3;
467 if (iso2022jp2)
468 return WC_CES_ISO_2022_JP_2;
469 if (iso2022cn)
470 return WC_CES_ISO_2022_CN;
471 if (iso2022kr)
472 return WC_CES_ISO_2022_KR;
473 return WC_CES_ISO_2022_JP;
474 }
475 switch (hint) {
476 case WC_CES_ISO_2022_JP:
477 case WC_CES_ISO_2022_JP_2:
478 case WC_CES_ISO_2022_JP_3:
479 case WC_CES_ISO_2022_KR:
480 case WC_CES_ISO_2022_CN:
481 break;
482 case WC_CES_EUC_JP:
483 case WC_CES_EUC_CN:
484 case WC_CES_EUC_TW:
485 case WC_CES_EUC_KR:
486 if (euc_detect != DETECT_ERROR)
487 return hint;
488 break;
489 case WC_CES_SHIFT_JIS:
490 case WC_CES_SHIFT_JISX0213:
491 if (sjis_detect != DETECT_ERROR)
492 return hint;
493 break;
494 case WC_CES_BIG5:
495 if (big5_detect != DETECT_ERROR)
496 return hint;
497 break;
498#ifdef USE_UNICODE
499 case WC_CES_UTF_8:
500 return hint;
501#endif
502 case WC_CES_US_ASCII:
503#ifdef USE_UNICODE
504 if (utf8_detect != DETECT_ERROR)
505 return hint;
506#endif
507 if (latin_detect != DETECT_ERROR)
508 return WC_CES_ISO_8859_1;
509 return hint;
510 default:
511 if (latin_detect != DETECT_ERROR)
512 return hint;
513 if (priv_detect != DETECT_ERROR)
514 return hint;
515#ifdef USE_UNICODE
516 if (utf8_detect != DETECT_ERROR)
517 return WC_CES_UTF_8;
518#endif
519 return hint;
520 }
521 if (euc_detect == DETECT_OK)
522 return euc;
523 if (sjis_detect == DETECT_OK)
524 return WC_CES_SHIFT_JIS;
525 if (big5_detect == DETECT_OK)
526 return WC_CES_BIG5;
527#ifdef USE_UNICODE
528 if (utf8_detect == DETECT_OK)
529 return WC_CES_UTF_8;
530 if (sjis_detect & DETECT_POSSIBLE)
531 return WC_CES_SHIFT_JIS;
532#endif
533 if (euc_detect != DETECT_ERROR)
534 return euc;
535 if (sjis_detect != DETECT_ERROR)
536 return WC_CES_SHIFT_JIS;
537 if (big5_detect != DETECT_ERROR)
538 return WC_CES_BIG5;
539#ifdef USE_UNICODE
540 if (utf8_detect != DETECT_ERROR)
541 return WC_CES_UTF_8;
542#endif
543 return hint;
544}