PageRenderTime 75ms CodeModel.GetById 37ms RepoModel.GetById 1ms app.codeStats 0ms

/libwc/detect.c

https://github.com/fujimogn/w3m
C | 544 lines | 529 code | 11 blank | 4 comment | 177 complexity | 6eb668be5265700f20ebcb2bfc3ded1e MD5 | raw file
  1. #include "wc.h"
  2. #include "iso2022.h"
  3. #include "sjis.h"
  4. #include "big5.h"
  5. #include "hz.h"
  6. #include "viet.h"
  7. #ifdef USE_UNICODE
  8. #include "utf8.h"
  9. #include "utf7.h"
  10. #endif
  11. wc_uint8 WC_DETECT_MAP[ 0x100 ] = {
  12. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  13. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  14. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  15. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  16. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  17. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  18. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  19. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  20. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  21. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  22. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  23. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  24. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  25. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  26. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  27. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  28. };
  /*
   * Per-encoding detection status.  The non-zero values are distinct bits so
   * that SET_DETECT() can accumulate them in one int.
   */
  #define DETECT_NORMAL   0x00	/* candidate enabled, nothing seen yet */
  #define DETECT_POSSIBLE 0x01	/* weak evidence seen */
  #define DETECT_OK       0x02	/* a complete valid sequence was seen */
  #define DETECT_BROKEN   0x04	/* one invalid sequence was tolerated */
  #define DETECT_ERROR    0x08	/* candidate ruled out */

  /* OR the status bit(s) y into the status variable x. */
  #define SET_DETECT(x,y) ((x) |= (y))

  /*
   * Record an invalid sequence in x: the first one only adds DETECT_BROKEN,
   * a second one (DETECT_BROKEN already set) replaces x with DETECT_ERROR.
   * x is evaluated more than once, so it must be a plain lvalue.
   */
  #define SET_BROKEN_ERROR(x) \
      ((x) = (((x) & DETECT_BROKEN) \
              ? DETECT_ERROR \
              : ((x) | DETECT_BROKEN)))
  36. void
  37. wc_create_detect_map(wc_ces ces, wc_bool esc)
  38. {
  39. static wc_ces detect_ces = WC_CES_US_ASCII;
  40. int i;
  41. if (ces != detect_ces) {
  42. if (ces & WC_CES_T_VIET) {
  43. wc_uint8 *map = NULL;
  44. switch (ces) {
  45. case WC_CES_TCVN_5712:
  46. map = wc_c0_tcvn57122_map;
  47. break;
  48. case WC_CES_VISCII_11:
  49. map = wc_c0_viscii112_map;
  50. break;
  51. case WC_CES_VPS:
  52. map = wc_c0_vps2_map;
  53. break;
  54. }
  55. for (i = 0; i < 0x20; i++)
  56. WC_DETECT_MAP[i] = map[i] ? 1 : 0;
  57. } else {
  58. for (i = 0; i < 0x20; i++)
  59. WC_DETECT_MAP[i] = 0;
  60. WC_DETECT_MAP[WC_C_HZ_TILDA] = (ces == WC_CES_HZ_GB_2312) ? 1 : 0;
  61. #ifdef USE_UNICODE
  62. WC_DETECT_MAP[WC_C_UTF7_PLUS] = (ces == WC_CES_UTF_7) ? 1 : 0;
  63. #endif
  64. }
  65. detect_ces = ces;
  66. }
  67. WC_DETECT_MAP[WC_C_ESC] = (esc || (ces & WC_CES_T_ISO_2022)) ? 1 : 0;
  68. return;
  69. }
  70. wc_ces
  71. wc_auto_detect(char *is, size_t len, wc_ces hint)
  72. {
  73. wc_uchar *p = (wc_uchar *)is;
  74. wc_uchar *ep = p + len;
  75. wc_uchar *q;
  76. wc_ces euc = 0, priv = 0;
  77. wc_status st;
  78. int euc_state = 0, sjis_state = 0, big5_state = 0, hz_state = 0;
  79. int iso_detect = DETECT_ERROR, euc_detect = DETECT_ERROR,
  80. sjis_detect = DETECT_ERROR, big5_detect = DETECT_ERROR,
  81. hz_detect = DETECT_ERROR, latin_detect = DETECT_ERROR,
  82. priv_detect = DETECT_ERROR;
  83. int possible = 0;
  84. wc_bool iso2022jp2 = WC_FALSE, iso2022jp3 = WC_FALSE,
  85. iso2022cn = WC_FALSE, iso2022kr = WC_FALSE, ok = WC_FALSE;
  86. #ifdef USE_UNICODE
  87. int utf8_state = 0;
  88. int utf8_detect = DETECT_ERROR;
  89. int utf8_next = 0;
  90. #endif
  91. wc_create_detect_map(hint, WC_TRUE);
  92. for (; p < ep && ! WC_DETECT_MAP[*p]; p++)
  93. ;
  94. if (p == ep)
  95. return hint;
  96. switch (hint) {
  97. case WC_CES_ISO_2022_JP:
  98. case WC_CES_ISO_2022_JP_2:
  99. case WC_CES_ISO_2022_JP_3:
  100. case WC_CES_EUC_JP:
  101. case WC_CES_SHIFT_JIS:
  102. case WC_CES_SHIFT_JISX0213:
  103. euc = WC_CES_EUC_JP;
  104. euc_state = WC_EUC_NOSTATE;
  105. sjis_state = WC_SJIS_NOSTATE;
  106. iso_detect = euc_detect = sjis_detect = DETECT_NORMAL;
  107. possible = 3;
  108. break;
  109. case WC_CES_ISO_2022_CN:
  110. case WC_CES_EUC_CN:
  111. euc = WC_CES_EUC_CN;
  112. euc_state = WC_EUC_NOSTATE;
  113. big5_state = WC_BIG5_NOSTATE;
  114. iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
  115. possible = 3;
  116. break;
  117. case WC_CES_EUC_TW:
  118. case WC_CES_BIG5:
  119. euc = WC_CES_EUC_TW;
  120. euc_state = WC_EUC_NOSTATE;
  121. big5_state = WC_BIG5_NOSTATE;
  122. iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
  123. possible = 3;
  124. break;
  125. case WC_CES_HZ_GB_2312:
  126. euc = WC_CES_EUC_CN;
  127. euc_state = WC_EUC_NOSTATE;
  128. hz_state = WC_HZ_NOSTATE;
  129. iso_detect = euc_detect = big5_detect = hz_detect = DETECT_NORMAL;
  130. possible = 4;
  131. break;
  132. case WC_CES_ISO_2022_KR:
  133. case WC_CES_EUC_KR:
  134. euc = WC_CES_EUC_KR;
  135. euc_state = WC_EUC_NOSTATE;
  136. iso_detect = euc_detect = DETECT_NORMAL;
  137. possible = 3;
  138. break;
  139. #ifdef USE_UNICODE
  140. case WC_CES_UTF_8:
  141. iso_detect = DETECT_NORMAL;
  142. possible = 1;
  143. break;
  144. #endif
  145. case WC_CES_US_ASCII:
  146. iso_detect = latin_detect = DETECT_NORMAL;
  147. possible = 2;
  148. break;
  149. default:
  150. if (hint & WC_CES_T_ISO_8859) {
  151. iso_detect = latin_detect = DETECT_NORMAL;
  152. possible = 2;
  153. } else {
  154. iso_detect = priv_detect = DETECT_NORMAL;
  155. priv = hint; /* for TVCN, VISCII, VPS */
  156. possible = 2;
  157. }
  158. break;
  159. }
  160. #ifdef USE_UNICODE
  161. if (priv_detect == DETECT_ERROR) {
  162. utf8_detect = DETECT_NORMAL;
  163. possible++;
  164. }
  165. #endif
  166. wc_input_init(WC_CES_US_ASCII, &st);
  167. for (; p < ep; p++) {
  168. if (possible == 0 || (possible == 1 && ok))
  169. break;
  170. if (iso_detect != DETECT_ERROR) {
  171. switch (*p) {
  172. case WC_C_ESC:
  173. if (*(p+1) == WC_C_MBCS) {
  174. q = p;
  175. if (! wc_parse_iso2022_esc(&q, &st))
  176. break;
  177. if (st.design[0] == WC_CCS_JIS_C_6226 ||
  178. st.design[0] == WC_CCS_JIS_X_0208)
  179. ;
  180. else if (st.design[0] == WC_CCS_JIS_X_0213_1 ||
  181. st.design[0] == WC_CCS_JIS_X_0213_2)
  182. iso2022jp3 = WC_TRUE;
  183. else if (WC_CCS_TYPE(st.design[0]) == WC_CCS_A_CS94W)
  184. iso2022jp2 = WC_TRUE;
  185. if (st.design[1] == WC_CCS_KS_X_1001)
  186. iso2022kr = WC_TRUE;
  187. else if (st.design[1] == WC_CCS_GB_2312 ||
  188. st.design[1] == WC_CCS_ISO_IR_165 ||
  189. st.design[1] == WC_CCS_CNS_11643_1)
  190. iso2022cn = WC_TRUE;
  191. if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS94W ||
  192. WC_CCS_TYPE(st.design[3]) == WC_CCS_A_CS94W)
  193. iso2022cn = WC_TRUE;
  194. } else if (*(p+1) == WC_C_G2_CS96) {
  195. q = p;
  196. if (! wc_parse_iso2022_esc(&q, &st))
  197. break;
  198. if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS96)
  199. iso2022jp2 = WC_TRUE;
  200. } else if (*(p+1) == WC_C_CSWSR) {
  201. q = p;
  202. if (! wc_parse_iso2022_esc(&q, &st))
  203. break;
  204. possible = 0;
  205. iso_detect = DETECT_BROKEN;
  206. continue;
  207. }
  208. iso_detect = DETECT_OK;
  209. ok = WC_TRUE;
  210. break;
  211. case WC_C_SI:
  212. case WC_C_SO:
  213. iso_detect = DETECT_OK;
  214. ok = WC_TRUE;
  215. iso2022cn = WC_TRUE;
  216. iso2022kr = WC_TRUE;
  217. break;
  218. default:
  219. if (*p & 0x80) {
  220. iso_detect = DETECT_ERROR;
  221. possible--;
  222. }
  223. break;
  224. }
  225. }
  226. if (euc_detect != DETECT_ERROR) {
  227. switch (euc_state) {
  228. case WC_EUC_NOSTATE:
  229. switch (WC_ISO_MAP[*p]) {
  230. case WC_ISO_MAP_GR:
  231. euc_state = WC_EUC_MBYTE1;
  232. break;
  233. case WC_ISO_MAP_SS2:
  234. if (euc == WC_CES_EUC_JP)
  235. euc_state = WC_EUC_MBYTE1;
  236. else if (euc == WC_CES_EUC_TW)
  237. euc_state = WC_EUC_TW_SS2;
  238. else
  239. euc_detect = DETECT_ERROR;
  240. break;
  241. case WC_ISO_MAP_SS3:
  242. if (euc == WC_CES_EUC_JP &&
  243. WC_ISO_MAP[*(p+1)] == WC_ISO_MAP_GR)
  244. ;
  245. else
  246. euc_detect = DETECT_ERROR;
  247. break;
  248. case WC_ISO_MAP_C1:
  249. case WC_ISO_MAP_GR96:
  250. euc_detect = DETECT_ERROR;
  251. break;
  252. }
  253. break;
  254. case WC_EUC_MBYTE1:
  255. if (WC_ISO_MAP[*p] == WC_ISO_MAP_GR) {
  256. SET_DETECT(euc_detect, DETECT_OK);
  257. ok = WC_TRUE;
  258. } else
  259. SET_BROKEN_ERROR(euc_detect);
  260. euc_state = WC_EUC_NOSTATE;
  261. break;
  262. case WC_EUC_TW_SS2:
  263. if (!( 0xa0 <= *p && *p <= 0xb0) ||
  264. WC_ISO_MAP[*(p+1)] != WC_ISO_MAP_GR)
  265. euc_detect = DETECT_ERROR;
  266. euc_state = WC_EUC_NOSTATE;
  267. break;
  268. }
  269. if (euc_detect == DETECT_ERROR)
  270. possible--;
  271. }
  272. if (sjis_detect != DETECT_ERROR) {
  273. switch (sjis_state) {
  274. case WC_SJIS_NOSTATE:
  275. switch (WC_SJIS_MAP[*p]) {
  276. case WC_SJIS_MAP_SL:
  277. case WC_SJIS_MAP_SH:
  278. sjis_state = WC_SJIS_SHIFT_L;
  279. break;
  280. case WC_SJIS_MAP_SK:
  281. SET_DETECT(sjis_detect, DETECT_POSSIBLE);
  282. break;
  283. case WC_SJIS_MAP_SX:
  284. if (WcOption.use_jisx0213) {
  285. sjis_state = WC_SJIS_SHIFT_X;
  286. break;
  287. }
  288. case WC_SJIS_MAP_80:
  289. case WC_SJIS_MAP_A0:
  290. case WC_SJIS_MAP_C1:
  291. sjis_detect = DETECT_ERROR;
  292. break;
  293. }
  294. break;
  295. case WC_SJIS_SHIFT_L:
  296. if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB) {
  297. SET_DETECT(sjis_detect, DETECT_OK);
  298. ok = WC_TRUE;
  299. } else
  300. SET_BROKEN_ERROR(sjis_detect);
  301. sjis_state = WC_SJIS_NOSTATE;
  302. break;
  303. case WC_SJIS_SHIFT_X:
  304. if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB)
  305. SET_DETECT(sjis_detect, DETECT_POSSIBLE);
  306. else
  307. sjis_detect = DETECT_ERROR;
  308. sjis_state = WC_SJIS_NOSTATE;
  309. break;
  310. }
  311. if (sjis_detect == DETECT_ERROR)
  312. possible--;
  313. }
  314. if (big5_detect != DETECT_ERROR) {
  315. switch (big5_state) {
  316. case WC_BIG5_NOSTATE:
  317. switch (WC_BIG5_MAP[*p]) {
  318. case WC_BIG5_MAP_UB:
  319. big5_state = WC_BIG5_MBYTE1;
  320. break;
  321. case WC_BIG5_MAP_C1:
  322. big5_detect = DETECT_ERROR;
  323. break;
  324. }
  325. break;
  326. case WC_BIG5_MBYTE1:
  327. if (WC_BIG5_MAP[*p] & WC_BIG5_MAP_LB) {
  328. SET_DETECT(big5_detect, DETECT_OK);
  329. ok = WC_TRUE;
  330. } else
  331. SET_BROKEN_ERROR(big5_detect);
  332. big5_state = WC_BIG5_NOSTATE;
  333. break;
  334. }
  335. if (big5_detect == DETECT_ERROR)
  336. possible--;
  337. }
  338. if (hz_detect != DETECT_ERROR) {
  339. if (*p & 0x80) {
  340. hz_detect = DETECT_ERROR;
  341. possible--;
  342. } else {
  343. switch (hz_state) {
  344. case WC_HZ_NOSTATE:
  345. if (*p == WC_C_HZ_TILDA)
  346. hz_state = WC_HZ_TILDA;
  347. break;
  348. case WC_HZ_TILDA:
  349. if (*p == WC_C_HZ_SI)
  350. hz_state = WC_HZ_MBYTE;
  351. else
  352. hz_state = WC_HZ_NOSTATE;
  353. break;
  354. case WC_HZ_TILDA_MB:
  355. if (*p == WC_C_HZ_SO)
  356. hz_state = WC_HZ_NOSTATE;
  357. else
  358. hz_state = WC_HZ_MBYTE;
  359. break;
  360. case WC_HZ_MBYTE:
  361. if (*p == WC_C_HZ_TILDA)
  362. hz_state = WC_HZ_TILDA_MB;
  363. else
  364. hz_state = WC_HZ_MBYTE1;
  365. break;
  366. case WC_HZ_MBYTE1:
  367. hz_detect = DETECT_OK;
  368. ok = WC_TRUE;
  369. hz_state = WC_HZ_NOSTATE;
  370. break;
  371. }
  372. }
  373. }
  374. if (latin_detect != DETECT_ERROR) {
  375. switch (WC_ISO_MAP[*p] & WC_ISO_MAP_CG) {
  376. case WC_ISO_MAP_GR:
  377. case WC_ISO_MAP_GR96:
  378. SET_DETECT(latin_detect, DETECT_OK);
  379. ok = WC_TRUE;
  380. break;
  381. case WC_ISO_MAP_C1:
  382. latin_detect = DETECT_ERROR;
  383. break;
  384. }
  385. if (latin_detect == DETECT_ERROR)
  386. possible--;
  387. }
  388. if (priv_detect != DETECT_ERROR) {
  389. if (*p != WC_C_ESC && WC_DETECT_MAP[*p]) {
  390. SET_DETECT(priv_detect, DETECT_OK);
  391. ok = WC_TRUE;
  392. }
  393. /*
  394. if (priv_detect == DETECT_ERROR)
  395. possible--;
  396. */
  397. }
  398. #ifdef USE_UNICODE
  399. if (utf8_detect != DETECT_ERROR) {
  400. switch (utf8_state) {
  401. case WC_UTF8_NOSTATE:
  402. switch (utf8_next = WC_UTF8_MAP[*p]) {
  403. case 1:
  404. case 8:
  405. break;
  406. case 0:
  407. case 7:
  408. utf8_detect = DETECT_ERROR;
  409. break;
  410. default:
  411. utf8_next--;
  412. utf8_state = WC_UTF8_NEXT;
  413. break;
  414. }
  415. break;
  416. case WC_UTF8_NEXT:
  417. if (WC_UTF8_MAP[*p]) {
  418. utf8_detect = DETECT_ERROR;
  419. utf8_state = WC_UTF8_NOSTATE;
  420. break;
  421. }
  422. utf8_next--;
  423. if (! utf8_next) {
  424. SET_DETECT(utf8_detect, DETECT_OK);
  425. ok = WC_TRUE;
  426. utf8_state = WC_UTF8_NOSTATE;
  427. }
  428. break;
  429. }
  430. if (utf8_detect == DETECT_ERROR)
  431. possible--;
  432. }
  433. #endif
  434. }
  435. if (iso_detect != DETECT_ERROR) {
  436. if (iso_detect == DETECT_NORMAL) {
  437. if (hz_detect == DETECT_OK)
  438. return WC_CES_HZ_GB_2312;
  439. if (priv_detect == DETECT_OK)
  440. return priv;
  441. return WC_CES_US_ASCII;
  442. }
  443. switch (euc) {
  444. case WC_CES_EUC_CN:
  445. case WC_CES_EUC_TW:
  446. if (iso2022cn)
  447. return WC_CES_ISO_2022_CN;
  448. break;
  449. case WC_CES_EUC_KR:
  450. if (iso2022kr)
  451. return WC_CES_ISO_2022_KR;
  452. break;
  453. }
  454. if (iso2022jp3)
  455. return WC_CES_ISO_2022_JP_3;
  456. if (iso2022jp2)
  457. return WC_CES_ISO_2022_JP_2;
  458. if (iso2022cn)
  459. return WC_CES_ISO_2022_CN;
  460. if (iso2022kr)
  461. return WC_CES_ISO_2022_KR;
  462. return WC_CES_ISO_2022_JP;
  463. }
  464. switch (hint) {
  465. case WC_CES_ISO_2022_JP:
  466. case WC_CES_ISO_2022_JP_2:
  467. case WC_CES_ISO_2022_JP_3:
  468. case WC_CES_ISO_2022_KR:
  469. case WC_CES_ISO_2022_CN:
  470. break;
  471. case WC_CES_EUC_JP:
  472. case WC_CES_EUC_CN:
  473. case WC_CES_EUC_TW:
  474. case WC_CES_EUC_KR:
  475. if (euc_detect != DETECT_ERROR)
  476. return hint;
  477. break;
  478. case WC_CES_SHIFT_JIS:
  479. case WC_CES_SHIFT_JISX0213:
  480. if (sjis_detect != DETECT_ERROR)
  481. return hint;
  482. break;
  483. case WC_CES_BIG5:
  484. if (big5_detect != DETECT_ERROR)
  485. return hint;
  486. break;
  487. #ifdef USE_UNICODE
  488. case WC_CES_UTF_8:
  489. return hint;
  490. #endif
  491. case WC_CES_US_ASCII:
  492. #ifdef USE_UNICODE
  493. if (utf8_detect != DETECT_ERROR)
  494. return hint;
  495. #endif
  496. if (latin_detect != DETECT_ERROR)
  497. return WC_CES_ISO_8859_1;
  498. return hint;
  499. default:
  500. if (latin_detect != DETECT_ERROR)
  501. return hint;
  502. if (priv_detect != DETECT_ERROR)
  503. return hint;
  504. #ifdef USE_UNICODE
  505. if (utf8_detect != DETECT_ERROR)
  506. return WC_CES_UTF_8;
  507. #endif
  508. return hint;
  509. }
  510. if (euc_detect == DETECT_OK)
  511. return euc;
  512. if (sjis_detect == DETECT_OK)
  513. return WC_CES_SHIFT_JIS;
  514. if (big5_detect == DETECT_OK)
  515. return WC_CES_BIG5;
  516. #ifdef USE_UNICODE
  517. if (utf8_detect == DETECT_OK)
  518. return WC_CES_UTF_8;
  519. if (sjis_detect & DETECT_POSSIBLE)
  520. return WC_CES_SHIFT_JIS;
  521. #endif
  522. if (euc_detect != DETECT_ERROR)
  523. return euc;
  524. if (sjis_detect != DETECT_ERROR)
  525. return WC_CES_SHIFT_JIS;
  526. if (big5_detect != DETECT_ERROR)
  527. return WC_CES_BIG5;
  528. #ifdef USE_UNICODE
  529. if (utf8_detect != DETECT_ERROR)
  530. return WC_CES_UTF_8;
  531. #endif
  532. return hint;
  533. }