PageRenderTime 85ms CodeModel.GetById 44ms app.highlight 36ms RepoModel.GetById 1ms app.codeStats 0ms

/libwc/detect.c

https://github.com/fujimogn/w3m
C | 544 lines | 529 code | 11 blank | 4 comment | 177 complexity | 6eb668be5265700f20ebcb2bfc3ded1e MD5 | raw file
  1
  2#include "wc.h"
  3#include "iso2022.h"
  4#include "sjis.h"
  5#include "big5.h"
  6#include "hz.h"
  7#include "viet.h"
  8#ifdef USE_UNICODE
  9#include "utf8.h"
 10#include "utf7.h"
 11#endif
 12
 13wc_uint8 WC_DETECT_MAP[ 0x100 ] = {
 14    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 15    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 16    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 17    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 18    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 19    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 20    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 21    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 
 22    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 23    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 24    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 25    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 26    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 27    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 28    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 29    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 
 30};
 31
 32#define DETECT_NORMAL	0
 33#define DETECT_POSSIBLE	1
 34#define DETECT_OK	2
 35#define DETECT_BROKEN	4
 36#define DETECT_ERROR	8
 37#define SET_DETECT(x,y) ((x) |= (y))
 38#define SET_BROKEN_ERROR(x) ((x) = ((x) & DETECT_BROKEN) ? DETECT_ERROR : ((x) | DETECT_BROKEN))
 39
 40void
 41wc_create_detect_map(wc_ces ces, wc_bool esc)
 42{
 43    static wc_ces detect_ces = WC_CES_US_ASCII;
 44    int i;
 45
 46    if (ces != detect_ces) {
 47	if (ces & WC_CES_T_VIET) {
 48	    wc_uint8 *map = NULL;
 49	    switch (ces) {
 50	    case WC_CES_TCVN_5712:
 51		map = wc_c0_tcvn57122_map;
 52		break;
 53	    case WC_CES_VISCII_11:
 54		map = wc_c0_viscii112_map;
 55		break;
 56	    case WC_CES_VPS:
 57		map = wc_c0_vps2_map;
 58		break;
 59	    }
 60	    for (i = 0; i < 0x20; i++)
 61		WC_DETECT_MAP[i] = map[i] ? 1 : 0;
 62	} else {
 63	    for (i = 0; i < 0x20; i++)
 64		WC_DETECT_MAP[i] = 0;
 65	    WC_DETECT_MAP[WC_C_HZ_TILDA] = (ces == WC_CES_HZ_GB_2312) ? 1 : 0;
 66#ifdef USE_UNICODE
 67	    WC_DETECT_MAP[WC_C_UTF7_PLUS] = (ces == WC_CES_UTF_7) ? 1 : 0;
 68#endif
 69	}
 70	detect_ces = ces;
 71    }
 72    WC_DETECT_MAP[WC_C_ESC] = (esc || (ces & WC_CES_T_ISO_2022)) ? 1 : 0;
 73    return;
 74}
 75
 76wc_ces
 77wc_auto_detect(char *is, size_t len, wc_ces hint)
 78{
 79    wc_uchar *p = (wc_uchar *)is;
 80    wc_uchar *ep = p + len;
 81    wc_uchar *q;
 82    wc_ces euc = 0, priv = 0;
 83    wc_status st;
 84    int euc_state = 0, sjis_state = 0, big5_state = 0, hz_state = 0;
 85    int iso_detect = DETECT_ERROR, euc_detect = DETECT_ERROR,
 86	sjis_detect = DETECT_ERROR, big5_detect = DETECT_ERROR,
 87	hz_detect = DETECT_ERROR, latin_detect = DETECT_ERROR,
 88	priv_detect = DETECT_ERROR;
 89    int possible = 0;
 90    wc_bool iso2022jp2 = WC_FALSE, iso2022jp3 = WC_FALSE,
 91	iso2022cn = WC_FALSE, iso2022kr = WC_FALSE, ok = WC_FALSE;
 92#ifdef USE_UNICODE
 93    int utf8_state = 0;
 94    int utf8_detect = DETECT_ERROR;
 95    int utf8_next = 0;
 96#endif
 97
 98    wc_create_detect_map(hint, WC_TRUE);
 99    for (; p < ep && ! WC_DETECT_MAP[*p]; p++)
100	;
101    if (p == ep)
102	return hint;
103
104    switch (hint) {
105    case WC_CES_ISO_2022_JP:
106    case WC_CES_ISO_2022_JP_2:
107    case WC_CES_ISO_2022_JP_3:
108    case WC_CES_EUC_JP:
109    case WC_CES_SHIFT_JIS:
110    case WC_CES_SHIFT_JISX0213:
111	euc = WC_CES_EUC_JP;
112	euc_state = WC_EUC_NOSTATE;
113	sjis_state = WC_SJIS_NOSTATE;
114	iso_detect = euc_detect = sjis_detect = DETECT_NORMAL;
115	possible = 3;
116	break;
117    case WC_CES_ISO_2022_CN:
118    case WC_CES_EUC_CN:
119	euc = WC_CES_EUC_CN;
120	euc_state = WC_EUC_NOSTATE;
121	big5_state = WC_BIG5_NOSTATE;
122	iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
123	possible = 3;
124	break;
125    case WC_CES_EUC_TW:
126    case WC_CES_BIG5:
127	euc = WC_CES_EUC_TW;
128	euc_state = WC_EUC_NOSTATE;
129	big5_state = WC_BIG5_NOSTATE;
130	iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
131	possible = 3;
132	break;
133    case WC_CES_HZ_GB_2312:
134	euc = WC_CES_EUC_CN;
135	euc_state = WC_EUC_NOSTATE;
136	hz_state = WC_HZ_NOSTATE;
137	iso_detect = euc_detect = big5_detect = hz_detect = DETECT_NORMAL;
138	possible = 4;
139	break;
140    case WC_CES_ISO_2022_KR:
141    case WC_CES_EUC_KR:
142	euc = WC_CES_EUC_KR;
143	euc_state = WC_EUC_NOSTATE;
144	iso_detect = euc_detect = DETECT_NORMAL;
145	possible = 3;
146	break;
147#ifdef USE_UNICODE
148    case WC_CES_UTF_8:
149	iso_detect = DETECT_NORMAL;
150	possible = 1;
151	break;
152#endif
153    case WC_CES_US_ASCII:
154	iso_detect = latin_detect = DETECT_NORMAL;
155	possible = 2;
156	break;
157    default:
158	if (hint & WC_CES_T_ISO_8859) {
159	    iso_detect = latin_detect = DETECT_NORMAL;
160	    possible = 2;
161	} else {
162	    iso_detect = priv_detect = DETECT_NORMAL;
163	    priv = hint;	/* for TVCN, VISCII, VPS */
164	    possible = 2;
165	}
166	break;
167    }
168#ifdef USE_UNICODE
169    if (priv_detect == DETECT_ERROR) {
170	utf8_detect = DETECT_NORMAL;
171	possible++;
172    }
173#endif
174
175    wc_input_init(WC_CES_US_ASCII, &st);
176
177    for (; p < ep; p++) {
178	if (possible == 0 || (possible == 1 && ok))
179	    break;
180	if (iso_detect != DETECT_ERROR) {
181	    switch (*p) {
182	    case WC_C_ESC:
183		if (*(p+1) == WC_C_MBCS) {
184		    q = p;
185		    if (! wc_parse_iso2022_esc(&q, &st))
186			break;
187		    if (st.design[0] == WC_CCS_JIS_C_6226 ||
188			st.design[0] == WC_CCS_JIS_X_0208)
189			;
190		    else if (st.design[0] == WC_CCS_JIS_X_0213_1 ||
191			     st.design[0] == WC_CCS_JIS_X_0213_2)
192			iso2022jp3 = WC_TRUE;
193		    else if (WC_CCS_TYPE(st.design[0]) == WC_CCS_A_CS94W)
194			iso2022jp2 = WC_TRUE;
195		    if (st.design[1] == WC_CCS_KS_X_1001)
196			iso2022kr = WC_TRUE;
197		    else if (st.design[1] == WC_CCS_GB_2312 ||
198			     st.design[1] == WC_CCS_ISO_IR_165 ||
199			     st.design[1] == WC_CCS_CNS_11643_1)
200			iso2022cn = WC_TRUE;
201		    if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS94W ||
202			WC_CCS_TYPE(st.design[3]) == WC_CCS_A_CS94W)
203			iso2022cn = WC_TRUE;
204		} else if (*(p+1) == WC_C_G2_CS96) {
205		    q = p;
206		    if (! wc_parse_iso2022_esc(&q, &st))
207			break;
208		    if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS96)
209			iso2022jp2 = WC_TRUE;
210		} else if (*(p+1) == WC_C_CSWSR) {
211		    q = p;
212		    if (! wc_parse_iso2022_esc(&q, &st))
213			break;
214		    possible = 0;
215		    iso_detect = DETECT_BROKEN;
216		    continue;
217		}
218		iso_detect = DETECT_OK;
219		ok = WC_TRUE;
220		break;
221	    case WC_C_SI:
222	    case WC_C_SO:
223		iso_detect = DETECT_OK;
224		ok = WC_TRUE;
225		iso2022cn = WC_TRUE;
226		iso2022kr = WC_TRUE;
227		break;
228	    default:
229		if (*p & 0x80) {
230		    iso_detect = DETECT_ERROR;
231		    possible--;
232		}
233		break;
234	    }
235	}
236	if (euc_detect != DETECT_ERROR) {
237	    switch (euc_state) {
238	    case WC_EUC_NOSTATE:
239		switch (WC_ISO_MAP[*p]) {
240		case WC_ISO_MAP_GR:
241		    euc_state = WC_EUC_MBYTE1;
242		    break;
243		case WC_ISO_MAP_SS2:
244		    if (euc == WC_CES_EUC_JP)
245			euc_state = WC_EUC_MBYTE1;
246		    else if (euc == WC_CES_EUC_TW)
247			euc_state = WC_EUC_TW_SS2;
248		    else
249			euc_detect = DETECT_ERROR;
250		    break;
251		case WC_ISO_MAP_SS3:
252		    if (euc == WC_CES_EUC_JP &&
253			WC_ISO_MAP[*(p+1)] == WC_ISO_MAP_GR)
254			;
255		    else
256			euc_detect = DETECT_ERROR;
257		    break;
258		case WC_ISO_MAP_C1:
259		case WC_ISO_MAP_GR96:
260		    euc_detect = DETECT_ERROR;
261		    break;
262		}
263		break;
264	    case WC_EUC_MBYTE1:
265		if (WC_ISO_MAP[*p] == WC_ISO_MAP_GR) {
266		    SET_DETECT(euc_detect, DETECT_OK);
267		    ok = WC_TRUE;
268		} else
269		    SET_BROKEN_ERROR(euc_detect);
270		euc_state = WC_EUC_NOSTATE;
271		break;
272	    case WC_EUC_TW_SS2:
273		if (!( 0xa0 <= *p && *p <= 0xb0) ||
274		    WC_ISO_MAP[*(p+1)] != WC_ISO_MAP_GR)
275		    euc_detect = DETECT_ERROR;
276		euc_state = WC_EUC_NOSTATE;
277		break;
278	    }
279	    if (euc_detect == DETECT_ERROR)
280		possible--;
281	}
282	if (sjis_detect != DETECT_ERROR) {
283	    switch (sjis_state) {
284	    case WC_SJIS_NOSTATE:
285		switch (WC_SJIS_MAP[*p]) {
286		case WC_SJIS_MAP_SL:
287		case WC_SJIS_MAP_SH:
288		    sjis_state = WC_SJIS_SHIFT_L;
289		    break;
290		case WC_SJIS_MAP_SK:
291		    SET_DETECT(sjis_detect, DETECT_POSSIBLE);
292		    break;
293		case WC_SJIS_MAP_SX:
294		    if (WcOption.use_jisx0213) {
295			sjis_state = WC_SJIS_SHIFT_X;
296			break;
297		    }
298		case WC_SJIS_MAP_80:
299		case WC_SJIS_MAP_A0:
300		case WC_SJIS_MAP_C1:
301		    sjis_detect = DETECT_ERROR;
302		    break;
303		}
304		break;
305	    case WC_SJIS_SHIFT_L:
306		if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB) {
307		    SET_DETECT(sjis_detect, DETECT_OK);
308		    ok = WC_TRUE;
309		} else
310		    SET_BROKEN_ERROR(sjis_detect);
311		sjis_state = WC_SJIS_NOSTATE;
312		break;
313	    case WC_SJIS_SHIFT_X:
314		if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB)
315		    SET_DETECT(sjis_detect, DETECT_POSSIBLE);
316		else
317		    sjis_detect = DETECT_ERROR;
318		sjis_state = WC_SJIS_NOSTATE;
319		break;
320	    }
321	    if (sjis_detect == DETECT_ERROR)
322		possible--;
323	}
324	if (big5_detect != DETECT_ERROR) {
325	    switch (big5_state) {
326	    case WC_BIG5_NOSTATE:
327		switch (WC_BIG5_MAP[*p]) {
328		case WC_BIG5_MAP_UB:
329		    big5_state = WC_BIG5_MBYTE1;
330		    break;
331		case WC_BIG5_MAP_C1:
332		    big5_detect = DETECT_ERROR;
333		    break;
334		}
335		break;
336	    case WC_BIG5_MBYTE1:
337		if (WC_BIG5_MAP[*p] & WC_BIG5_MAP_LB) {
338		    SET_DETECT(big5_detect, DETECT_OK);
339		    ok = WC_TRUE;
340		} else
341		    SET_BROKEN_ERROR(big5_detect);
342		big5_state = WC_BIG5_NOSTATE;
343		break;
344	    }
345	    if (big5_detect == DETECT_ERROR)
346		possible--;
347	}
348	if (hz_detect != DETECT_ERROR) {
349	  if (*p & 0x80) {
350		hz_detect = DETECT_ERROR;
351		possible--;
352	  } else {
353	    switch (hz_state) {
354	    case WC_HZ_NOSTATE:
355		if (*p == WC_C_HZ_TILDA)
356		    hz_state = WC_HZ_TILDA;
357		break;
358	    case WC_HZ_TILDA:
359		if (*p == WC_C_HZ_SI)
360		    hz_state = WC_HZ_MBYTE;
361		else
362		    hz_state = WC_HZ_NOSTATE;
363		break;
364	    case WC_HZ_TILDA_MB:
365		if (*p == WC_C_HZ_SO)
366		    hz_state = WC_HZ_NOSTATE;
367		else
368		    hz_state = WC_HZ_MBYTE;
369		break;
370	    case WC_HZ_MBYTE:
371		if (*p == WC_C_HZ_TILDA)
372		    hz_state = WC_HZ_TILDA_MB;
373		else
374		    hz_state = WC_HZ_MBYTE1;
375		break;
376	    case WC_HZ_MBYTE1:
377		hz_detect = DETECT_OK;
378		ok = WC_TRUE;
379		hz_state = WC_HZ_NOSTATE;
380		break;
381	    }
382	  }
383	}
384	if (latin_detect != DETECT_ERROR) {
385	    switch (WC_ISO_MAP[*p] & WC_ISO_MAP_CG) {
386	    case WC_ISO_MAP_GR:
387	    case WC_ISO_MAP_GR96:
388		SET_DETECT(latin_detect, DETECT_OK);
389		ok = WC_TRUE;
390		break;
391	    case WC_ISO_MAP_C1:
392		latin_detect = DETECT_ERROR;
393		break;
394	    }
395	    if (latin_detect == DETECT_ERROR)
396		possible--;
397	}
398	if (priv_detect != DETECT_ERROR) {
399	    if (*p != WC_C_ESC && WC_DETECT_MAP[*p]) {
400		SET_DETECT(priv_detect, DETECT_OK);
401		ok = WC_TRUE;
402	    }
403/*
404	    if (priv_detect == DETECT_ERROR)
405		possible--;
406*/
407	}
408#ifdef USE_UNICODE
409	if (utf8_detect != DETECT_ERROR) {
410	    switch (utf8_state) {
411	    case WC_UTF8_NOSTATE:
412		switch (utf8_next = WC_UTF8_MAP[*p]) {
413		case 1:
414		case 8:
415		    break;
416		case 0:
417		case 7:
418		    utf8_detect = DETECT_ERROR;
419		    break;
420		default:
421		    utf8_next--;
422		    utf8_state = WC_UTF8_NEXT;
423		    break;
424		}
425		break;
426	    case WC_UTF8_NEXT:
427		if (WC_UTF8_MAP[*p]) {
428		    utf8_detect = DETECT_ERROR;
429		    utf8_state = WC_UTF8_NOSTATE;
430		    break;
431		}
432		utf8_next--;
433		if (! utf8_next) {
434		    SET_DETECT(utf8_detect, DETECT_OK);
435		    ok = WC_TRUE;
436		    utf8_state = WC_UTF8_NOSTATE;
437		}
438		break;
439	    }
440	    if (utf8_detect == DETECT_ERROR)
441		possible--;
442	}
443#endif
444    }
445
446    if (iso_detect != DETECT_ERROR) {
447	if (iso_detect == DETECT_NORMAL) {
448	   if (hz_detect == DETECT_OK)
449		return WC_CES_HZ_GB_2312;
450	   if (priv_detect == DETECT_OK)
451		return priv;
452	   return WC_CES_US_ASCII;
453	}
454	switch (euc) {
455	case WC_CES_EUC_CN:
456	case WC_CES_EUC_TW:
457	    if (iso2022cn)
458		return WC_CES_ISO_2022_CN;
459	    break;
460	case WC_CES_EUC_KR:
461	    if (iso2022kr)
462		return WC_CES_ISO_2022_KR;
463	    break;
464	}
465	if (iso2022jp3)
466	    return WC_CES_ISO_2022_JP_3;
467	if (iso2022jp2)
468	    return WC_CES_ISO_2022_JP_2;
469	if (iso2022cn)
470	    return WC_CES_ISO_2022_CN;
471	if (iso2022kr)
472	    return WC_CES_ISO_2022_KR;
473	return WC_CES_ISO_2022_JP;
474    }
475    switch (hint) {
476    case WC_CES_ISO_2022_JP:
477    case WC_CES_ISO_2022_JP_2:
478    case WC_CES_ISO_2022_JP_3:
479    case WC_CES_ISO_2022_KR:
480    case WC_CES_ISO_2022_CN:
481	break;
482    case WC_CES_EUC_JP:
483    case WC_CES_EUC_CN:
484    case WC_CES_EUC_TW:
485    case WC_CES_EUC_KR:
486	if (euc_detect != DETECT_ERROR)
487	    return hint;
488	break;
489    case WC_CES_SHIFT_JIS:
490    case WC_CES_SHIFT_JISX0213:
491	if (sjis_detect != DETECT_ERROR)
492	    return hint;
493	break;
494    case WC_CES_BIG5:
495	if (big5_detect != DETECT_ERROR)
496	    return hint;
497	break;
498#ifdef USE_UNICODE
499    case WC_CES_UTF_8:
500	return hint;
501#endif
502    case WC_CES_US_ASCII:
503#ifdef USE_UNICODE
504	if (utf8_detect != DETECT_ERROR)
505	    return hint;
506#endif
507	if (latin_detect != DETECT_ERROR)
508	    return WC_CES_ISO_8859_1;
509	return hint;
510    default:
511	if (latin_detect != DETECT_ERROR)
512	    return hint;
513	if (priv_detect != DETECT_ERROR)
514	    return hint;
515#ifdef USE_UNICODE
516	if (utf8_detect != DETECT_ERROR)
517	    return WC_CES_UTF_8;
518#endif
519	return hint;
520    }
521    if (euc_detect == DETECT_OK)
522	return euc;
523    if (sjis_detect == DETECT_OK)
524	return WC_CES_SHIFT_JIS;
525    if (big5_detect == DETECT_OK)
526	return WC_CES_BIG5;
527#ifdef USE_UNICODE
528    if (utf8_detect == DETECT_OK)
529	return WC_CES_UTF_8;
530    if (sjis_detect & DETECT_POSSIBLE)
531	return WC_CES_SHIFT_JIS;
532#endif
533    if (euc_detect != DETECT_ERROR)
534	return euc;
535    if (sjis_detect != DETECT_ERROR)
536	return WC_CES_SHIFT_JIS;
537    if (big5_detect != DETECT_ERROR)
538	return WC_CES_BIG5;
539#ifdef USE_UNICODE
540    if (utf8_detect != DETECT_ERROR)
541	return WC_CES_UTF_8;
542#endif
543    return hint;
544}