PageRenderTime 68ms CodeModel.GetById 32ms RepoModel.GetById 1ms app.codeStats 0ms

/src/wml_aux/tidy/tidy.c

https://bitbucket.org/shlomif/website-meta-language
C | 1137 lines | 801 code | 201 blank | 135 comment | 310 complexity | 4ceff856078aa184f3b438434010f485 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0, LGPL-2.0
  1. /*
  2. tidy.c - HTML parser and pretty printer
  3. Copyright (c) 1998-2000 World Wide Web Consortium
  4. (Massachusetts Institute of Technology, Institut National de
  5. Recherche en Informatique et en Automatique, Keio University).
  6. All Rights Reserved.
  7. Contributing Author(s):
  8. Dave Raggett <dsr@w3.org>
  9. The contributing author(s) would like to thank all those who
  10. helped with testing, bug fixes, and patience. This wouldn't
  11. have been possible without all of you.
  12. COPYRIGHT NOTICE:
  13. This software and documentation is provided "as is," and
  14. the copyright holders and contributing author(s) make no
  15. representations or warranties, express or implied, including
  16. but not limited to, warranties of merchantability or fitness
  17. for any particular purpose or that the use of the software or
  18. documentation will not infringe any third party patents,
  19. copyrights, trademarks or other rights.
  20. The copyright holders and contributing author(s) will not be
  21. liable for any direct, indirect, special or consequential damages
  22. arising out of any use of the software or documentation, even if
  23. advised of the possibility of such damage.
  24. Permission is hereby granted to use, copy, modify, and distribute
  25. this source code, or portions hereof, documentation and executables,
  26. for any purpose, without fee, subject to the following restrictions:
  27. 1. The origin of this source code must not be misrepresented.
  28. 2. Altered versions must be plainly marked as such and must
  29. not be misrepresented as being the original source.
  30. 3. This Copyright notice may not be removed or altered from any
  31. source or altered source distribution.
  32. The copyright holders and contributing author(s) specifically
  33. permit, without fee, and encourage the use of this source code
  34. as a component for supporting the Hypertext Markup Language in
  35. commercial products. If you use this source code in a product,
  36. acknowledgment is not required but would be appreciated.
  37. */
  38. #include "platform.h"
  39. #include "html.h"
  40. void InitTidy(void);
  41. void DeInitTidy(void);
  42. extern char *release_date;
  43. Bool debug_flag = no;
  44. Node *debug_element = null;
  45. Lexer *debug_lexer = null;
  46. uint totalerrors = 0;
  47. uint totalwarnings = 0;
  48. uint optionerrors = 0;
  49. FILE *errout; /* set to stderr or stdout */
  50. FILE *input;
  /*
     Mapping for Windows Western character set (128-159) to Unicode.
     Index the table with (code - 128). An entry of 0x0000 marks a
     code without a mapping; ReadChar() drops such characters.
  */
  int Win2Unicode[32] =
  {
      /* 128-135 */ 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
      /* 136-143 */ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
      /* 144-151 */ 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
      /* 152-159 */ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
  };
  /*
     John Love-Jensen contributed this table for mapping MacRoman
     character set to Unicode.

     The table is indexed directly by the byte value. The first 128
     entries map each code to itself; the upper half holds the
     MacRoman specific characters.
  */
  int Mac2Unicode[256] =
  {
      /* x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
      /* x08 */ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
      /* x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
      /* x18 */ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
      /* x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
      /* x28 */ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
      /* x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
      /* x38 */ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
      /* x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
      /* x48 */ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
      /* x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
      /* x58 */ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
      /* x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
      /* x68 */ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
      /* x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
      /* x78 */ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
      /* x7F = DEL */
      /* x80 */ 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
      /* x88 */ 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
      /* x90 */ 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
      /* x98 */ 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
      /* xA0 */ 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
      /* xA8 */ 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
      /* xB0 */ 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
      /* xB8 */ 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
      /* xC0 */ 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
      /* xC8 */ 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
      /* xD0 */ 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
      /* xD8 */ 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
      /* xE0 */ 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
      /* xE8 */ 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
      /* xF0 = Apple Logo */
      /* xF0 */ 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
      /* xF8 */ 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
  };
  100. void FatalError(char *msg)
  101. {
  102. fprintf(stderr, "Fatal error: %s\n", msg);
  103. DeInitTidy();
  104. if (input && input != stdin)
  105. fclose(input);
  106. /* 2 signifies a serious error */
  107. exit(2);
  108. }
  109. void *MemAlloc(uint size)
  110. {
  111. void *p;
  112. p = malloc(size);
  113. if (!p)
  114. FatalError("Out of memory!");
  115. return p;
  116. }
  117. void *MemRealloc(void *mem, uint newsize)
  118. {
  119. void *p;
  120. if (mem == (void *)null)
  121. return MemAlloc(newsize);
  122. p = realloc(mem, newsize);
  123. if (!p)
  124. FatalError("Out of memory!");
  125. return p;
  126. }
  /*
     Release a block obtained from MemAlloc()/MemRealloc().
     Passing a null pointer is harmless: free() ignores it.
  */
  void MemFree(void *mem)
  {
      free(mem);
  }
  132. void ClearMemory(void *mem, uint size)
  133. {
  134. memset(mem, 0, size);
  135. }
  136. StreamIn *OpenInput(FILE *fp)
  137. {
  138. StreamIn *in;
  139. in = (StreamIn *)MemAlloc(sizeof(StreamIn));
  140. in->file = fp;
  141. in->pushed = no;
  142. in->c = '\0';
  143. in->tabs = 0;
  144. in->curline = 1;
  145. in->curcol = 1;
  146. in->encoding = CharEncoding;
  147. in->state = FSM_ASCII;
  148. return in;
  149. }
  150. /* read char from stream */
  151. static int ReadCharFromStream(StreamIn *in)
  152. {
  153. uint n, c, i, count;
  154. if (feof(in->file))
  155. return -1;
  156. c = getc(in->file);
  157. /*
  158. A document in ISO-2022 based encoding uses some ESC sequences
  159. called "designator" to switch character sets. The designators
  160. defined and used in ISO-2022-JP are:
  161. "ESC" + "(" + ? for ISO646 variants
  162. "ESC" + "$" + ? and
  163. "ESC" + "$" + "(" + ? for multibyte character sets
  164. Where ? stands for a single character used to indicate the
  165. character set for multibyte characters.
  166. Tidy handles this by preserving the escape sequence and
  167. setting the top bit of each byte for non-ascii chars. This
  168. bit is then cleared on output. The input stream keeps track
  169. of the state to determine when to set/clear the bit.
  170. */
  171. if (in->encoding == ISO2022)
  172. {
  173. if (c == 0x1b) /* ESC */
  174. {
  175. in->state = FSM_ESC;
  176. return c;
  177. }
  178. switch (in->state)
  179. {
  180. case FSM_ESC:
  181. if (c == '$')
  182. in->state = FSM_ESCD;
  183. else if (c == '(')
  184. in->state = FSM_ESCP;
  185. else
  186. in->state = FSM_ASCII;
  187. break;
  188. case FSM_ESCD:
  189. if (c == '(')
  190. in->state = FSM_ESCDP;
  191. else
  192. in->state = FSM_NONASCII;
  193. break;
  194. case FSM_ESCDP:
  195. in->state = FSM_NONASCII;
  196. break;
  197. case FSM_ESCP:
  198. in->state = FSM_ASCII;
  199. break;
  200. case FSM_NONASCII:
  201. c |= 0x80;
  202. break;
  203. }
  204. return c;
  205. }
  206. if (in->encoding != UTF8)
  207. return c;
  208. /* deal with UTF-8 encoded char */
  209. if ((c & 0xE0) == 0xC0) /* 110X XXXX two bytes */
  210. {
  211. n = c & 31;
  212. count = 1;
  213. }
  214. else if ((c & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
  215. {
  216. n = c & 15;
  217. count = 2;
  218. }
  219. else if ((c & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
  220. {
  221. n = c & 7;
  222. count = 3;
  223. }
  224. else if ((c & 0xFC) == 0xF8) /* 1111 10XX five bytes */
  225. {
  226. n = c & 3;
  227. count = 4;
  228. }
  229. else if ((c & 0xFE) == 0xFC) /* 1111 110X six bytes */
  230. {
  231. n = c & 1;
  232. count = 5;
  233. }
  234. else /* 0XXX XXXX one byte */
  235. return c;
  236. /* successor bytes should have the form 10XX XXXX */
  237. for (i = 1; i <= count; ++i)
  238. {
  239. if (feof(in->file))
  240. return -1;
  241. c = getc(in->file);
  242. n = (n << 6) | (c & 0x3F);
  243. }
  244. return n;
  245. }
  246. int ReadChar(StreamIn *in)
  247. {
  248. int c;
  249. if (in->pushed)
  250. {
  251. in->pushed = no;
  252. c = in->c;
  253. if (c == '\n')
  254. {
  255. in->curcol = 1;
  256. in->curline++;
  257. return c;
  258. }
  259. in->curcol++;
  260. return c;
  261. }
  262. in->lastcol = in->curcol;
  263. if (in->tabs > 0)
  264. {
  265. in->curcol++;
  266. in->tabs--;
  267. return ' ';
  268. }
  269. for (;;)
  270. {
  271. c = ReadCharFromStream(in);
  272. if (c < 0)
  273. return EndOfStream;
  274. if (c == '\n')
  275. {
  276. in->curcol = 1;
  277. in->curline++;
  278. break;
  279. }
  280. if (c == '\t')
  281. {
  282. in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1;
  283. in->curcol++;
  284. c = ' ';
  285. break;
  286. }
  287. /* strip control characters, except for Esc */
  288. if (c == '\033')
  289. break;
  290. if (0 < c && c < 32)
  291. continue;
  292. /* watch out for IS02022 */
  293. if (in->encoding == RAW || in->encoding == ISO2022)
  294. {
  295. in->curcol++;
  296. break;
  297. }
  298. if (in->encoding == MACROMAN)
  299. c = Mac2Unicode[c];
  300. /* produced e.g. as a side-effect of smart quotes in Word */
  301. if (127 < c && c < 160)
  302. {
  303. ReportEncodingError(in->lexer, WINDOWS_CHARS, c);
  304. c = Win2Unicode[c - 128];
  305. if (c == 0)
  306. continue;
  307. }
  308. in->curcol++;
  309. break;
  310. }
  311. return c;
  312. }
  313. void UngetChar(int c, StreamIn *in)
  314. {
  315. in->pushed = yes;
  316. in->c = c;
  317. if (c == '\n')
  318. --(in->curline);
  319. in->curcol = in->lastcol;
  320. }
  321. /* like strdup but using MemAlloc */
  322. char *wstrdup(char *str)
  323. {
  324. char *s, *p;
  325. int len;
  326. if (str == null)
  327. return null;
  328. for (len = 0; str[len] != '\0'; ++len);
  329. s = (char *)MemAlloc(sizeof(char)*(1+len));
  330. for (p = s; (*p++ = *str++););
  331. return s;
  332. }
  333. /* like strndup but using MemAlloc */
  334. char *wstrndup(char *str, int len)
  335. {
  336. char *s, *p;
  337. if (str == null || len < 0)
  338. return null;
  339. s = (char *)MemAlloc(sizeof(char)*(1+len));
  340. p = s;
  341. while (len-- > 0 && (*p++ = *str++));
  342. *p = '\0';
  343. return s;
  344. }
  /*
     Copy size bytes from s2 to s1.

     Note that this differs from strncpy: for size >= 0 exactly size
     bytes are copied, the copy does not stop at a null byte in s2 and
     the destination is neither padded nor terminated. A negative size
     copies up to and including the terminating null byte of s2.
     Nothing happens when either pointer is null.
  */
  void wstrncpy(char *s1, char *s2, int size)
  {
      int i;

      if (!s1 || !s2)
          return;

      if (size < 0)
      {
          while ((*s1++ = *s2++))
              ;
          return;
      }

      for (i = 0; i < size; ++i)
          s1[i] = s2[i];
  }
  /*
     Copy the string s2, including its terminating null byte, to s1.
     The destination must be large enough for the whole string.
  */
  void wstrcpy(char *s1, char *s2)
  {
      char ch;

      do
      {
          ch = *s2++;
          *s1++ = ch;
      }
      while (ch != '\0');
  }
  /*
     Append the string s2, including its terminating null byte, to
     the end of the string in s1. The destination must have room for
     the combined string.
  */
  void wstrcat(char *s1, char *s2)
  {
      char *tail = s1;
      char ch;

      /* locate the terminator of the destination */
      while (*tail != '\0')
          ++tail;

      do
      {
          ch = *s2++;
          *tail++ = ch;
      }
      while (ch != '\0');
  }
  /*
     Compare two strings in the manner of strcmp.

     Returns 0 when the strings are equal, otherwise 1 or -1 depending
     on whether the first differing character of s1 is greater or
     smaller than that of s2. Characters are compared as plain char.
  */
  int wstrcmp(char *s1, char *s2)
  {
      for (; *s1 == *s2; ++s1, ++s2)
      {
          if (*s1 == '\0')
              return 0;
      }

      if (*s1 > *s2)
          return 1;

      return -1;
  }
  /*
     returns byte count, not char count

     The terminating null byte is not included in the count.
  */
  int wstrlen(char *str)
  {
      int count;

      for (count = 0; str[count] != '\0'; ++count)
          ;

      return count;
  }
  390. /*
  391. MS C 4.2 doesn't include strcasecmp.
  392. Note that tolower and toupper won't
  393. work on chars > 127
  394. */
  395. int wstrcasecmp(char *s1, char *s2)
  396. {
  397. uint c;
  398. while (c = (uint)(*s1), ToLower(c) == ToLower((uint)(*s2)))
  399. {
  400. if (c == '\0')
  401. return 0;
  402. ++s1;
  403. ++s2;
  404. }
  405. return (*s1 > *s2 ? 1 : -1);
  406. }
  /*
     Compare at most n characters of two strings.

     Returns 0 when the first n characters match or both strings end
     before that, otherwise 1 or -1 depending on whether the first
     differing character of s1 is greater or smaller than that of s2.
     A negative n never reaches zero, so the whole strings are
     compared in that case.
  */
  int wstrncmp(char *s1, char *s2, int n)
  {
      /*
         test the remaining count before touching the strings so that
         nothing beyond the first n characters is ever read
      */
      for (; n != 0; --n, ++s1, ++s2)
      {
          if (*s1 != *s2)
              return (*s1 > *s2 ? 1 : -1);

          if (*s1 == '\0')
              return 0;
      }

      return 0;
  }
  /*
     Compare at most n characters of two strings ignoring case.

     Returns 0 when the first n characters match or both strings end
     before that, otherwise 1 or -1 according to the order of the
     first differing pair of case folded characters. A negative n
     never reaches zero, so the whole strings are compared in that
     case.
  */
  int wstrncasecmp(char *s1, char *s2, int n)
  {
      int c1, c2;

      /*
         test the remaining count before touching the strings so that
         nothing beyond the first n characters is ever read
      */
      for (; n != 0; --n, ++s1, ++s2)
      {
          /* tolower() requires a value in the unsigned char range */
          c1 = tolower((unsigned char)*s1);
          c2 = tolower((unsigned char)*s2);

          /* order by the folded values, not by the raw characters */
          if (c1 != c2)
              return (c1 > c2 ? 1 : -1);

          if (c1 == '\0')
              return 0;
      }

      return 0;
  }
  441. Bool wsubstr(char *s1, char *s2)
  442. {
  443. int i, len1 = wstrlen(s1), len2 = wstrlen(s2);
  444. for (i = 0; i <= len1 - len2; ++i)
  445. {
  446. if (wstrncasecmp(s1+i, s2, len2) == 0)
  447. return yes;
  448. }
  449. return no;
  450. }
  451. /* For mac users, should we map Unicode back to MacRoman? */
  452. void outc(uint c, Out *out)
  453. {
  454. uint ch;
  455. if (out->encoding == UTF8)
  456. {
  457. if (c < 128)
  458. putc(c, out->fp);
  459. else if (c <= 0x7FF)
  460. {
  461. ch = (0xC0 | (c >> 6)); putc(ch, out->fp);
  462. ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
  463. }
  464. else if (c <= 0xFFFF)
  465. {
  466. ch = (0xE0 | (c >> 12)); putc(ch, out->fp);
  467. ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);
  468. ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
  469. }
  470. else if (c <= 0x1FFFFF)
  471. {
  472. ch = (0xF0 | (c >> 18)); putc(ch, out->fp);
  473. ch = (0x80 | ((c >> 12) & 0x3F)); putc(ch, out->fp);
  474. ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);
  475. ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
  476. }
  477. else
  478. {
  479. ch = (0xF8 | (c >> 24)); putc(ch, out->fp);
  480. ch = (0x80 | ((c >> 18) & 0x3F)); putc(ch, out->fp);
  481. ch = (0x80 | ((c >> 12) & 0x3F)); putc(ch, out->fp);
  482. ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);
  483. ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
  484. }
  485. }
  486. else if (out->encoding == ISO2022)
  487. {
  488. if (c == 0x1b) /* ESC */
  489. out->state = FSM_ESC;
  490. else
  491. {
  492. switch (out->state)
  493. {
  494. case FSM_ESC:
  495. if (c == '$')
  496. out->state = FSM_ESCD;
  497. else if (c == '(')
  498. out->state = FSM_ESCP;
  499. else
  500. out->state = FSM_ASCII;
  501. break;
  502. case FSM_ESCD:
  503. if (c == '(')
  504. out->state = FSM_ESCDP;
  505. else
  506. out->state = FSM_NONASCII;
  507. break;
  508. case FSM_ESCDP:
  509. out->state = FSM_NONASCII;
  510. break;
  511. case FSM_ESCP:
  512. out->state = FSM_ASCII;
  513. break;
  514. case FSM_NONASCII:
  515. c &= 0x7F;
  516. break;
  517. }
  518. }
  519. putc(c, out->fp);
  520. }
  521. else
  522. putc(c, out->fp);
  523. }
  524. /*
  525. first time initialization which should
  526. precede reading the command line
  527. */
  528. void InitTidy(void)
  529. {
  530. InitMap();
  531. InitAttrs();
  532. InitTags();
  533. InitEntities();
  534. InitConfig();
  535. totalerrors = totalwarnings = 0;
  536. XmlTags = XmlOut = HideEndTags = UpperCaseTags =
  537. MakeClean = writeback = OnlyErrors = no;
  538. input = null;
  539. errfile = null;
  540. errout = stderr;
  541. #ifdef CONFIG_FILE
  542. ParseConfigFile(CONFIG_FILE);
  543. #endif
  544. }
  /*
     call this when you have finished with tidy
     to free the hash tables and other resources

     Releases, in this order: the tag table, the attribute table,
     the entity table, the configuration and the print buffer.
  */
  void DeInitTidy(void)
  {
      FreeTags();
      FreeAttrTable();
      FreeEntities();

      FreeConfig();
      FreePrintBuf();
  }
  557. int main(int argc, char **argv)
  558. {
  559. char *file, *prog;
  560. Node *document, *doctype;
  561. Lexer *lexer;
  562. char *s, c, *arg, *current_errorfile = "stderr";
  563. Out out; /* normal output stream */
  564. #if PRESERVEFILETIMES
  565. struct utimbuf filetimes;
  566. struct stat sbuf;
  567. #endif
  568. Bool haveFileTimes;
  569. InitTidy();
  570. /* look for env var "HTML_TIDY" */
  571. /* then for ~/.tidyrc (on Unix) */
  572. if ((file = getenv("HTML_TIDY")))
  573. ParseConfigFile(file);
  574. #ifdef SUPPORT_GETPWNAM
  575. else
  576. ParseConfigFile("~/.tidyrc");
  577. #endif /* SUPPORT_GETPWNAM */
  578. /* read command line */
  579. prog = argv[0];
  580. while (argc > 0)
  581. {
  582. if (argc > 1 && argv[1][0] == '-')
  583. {
  584. /* support -foo and --foo */
  585. arg = argv[1] + 1;
  586. #if 0
  587. if (arg[0] == '-')
  588. ++arg;
  589. #endif
  590. if (strcmp(arg, "indent") == 0)
  591. IndentContent = yes;
  592. else if (strcmp(arg, "xml") == 0)
  593. XmlTags = yes;
  594. else if (strcmp(arg, "asxml") == 0 || strcmp(arg, "asxhtml") == 0)
  595. xHTML = yes;
  596. else if (strcmp(arg, "indent") == 0)
  597. {
  598. IndentContent = yes;
  599. SmartIndent = yes;
  600. }
  601. else if (strcmp(arg, "omit") == 0)
  602. HideEndTags = yes;
  603. else if (strcmp(arg, "upper") == 0)
  604. UpperCaseTags = yes;
  605. else if (strcmp(arg, "clean") == 0)
  606. MakeClean = yes;
  607. else if (strcmp(arg, "raw") == 0)
  608. CharEncoding = RAW;
  609. else if (strcmp(arg, "ascii") == 0)
  610. CharEncoding = ASCII;
  611. else if (strcmp(arg, "latin1") == 0)
  612. CharEncoding = LATIN1;
  613. else if (strcmp(arg, "utf8") == 0)
  614. CharEncoding = UTF8;
  615. else if (strcmp(arg, "iso2022") == 0)
  616. CharEncoding = ISO2022;
  617. else if (strcmp(arg, "mac") == 0)
  618. CharEncoding = MACROMAN;
  619. else if (strcmp(arg, "numeric") == 0)
  620. NumEntities = yes;
  621. else if (strcmp(arg, "modify") == 0)
  622. writeback = yes;
  623. else if (strcmp(arg, "change") == 0) /* obsolete */
  624. writeback = yes;
  625. else if (strcmp(arg, "update") == 0) /* obsolete */
  626. writeback = yes;
  627. else if (strcmp(arg, "errors") == 0)
  628. OnlyErrors = yes;
  629. else if (strcmp(arg, "quiet") == 0)
  630. Quiet = yes;
  631. else if (strcmp(arg, "slides") == 0)
  632. BurstSlides = yes;
  633. else if (strcmp(arg, "help") == 0 ||
  634. argv[1][1] == '?'|| argv[1][1] == 'h')
  635. {
  636. HelpText(stdout, prog);
  637. return 1;
  638. }
  639. else if (strcmp(arg, "config") == 0)
  640. {
  641. if (argc >= 3)
  642. {
  643. ParseConfigFile(argv[2]);
  644. --argc;
  645. ++argv;
  646. }
  647. }
  648. else if (strcmp(argv[1], "-file") == 0 ||
  649. strcmp(argv[1], "--file") == 0 ||
  650. strcmp(argv[1], "-f") == 0)
  651. {
  652. if (argc >= 3)
  653. {
  654. /* create copy that can be freed by FreeConfig() */
  655. errfile = wstrdup(argv[2]);
  656. --argc;
  657. ++argv;
  658. }
  659. }
  660. else if (strcmp(argv[1], "-wrap") == 0 ||
  661. strcmp(argv[1], "--wrap") == 0 ||
  662. strcmp(argv[1], "-w") == 0)
  663. {
  664. if (argc >= 3)
  665. {
  666. sscanf(argv[2], "%d", &wraplen);
  667. --argc;
  668. ++argv;
  669. }
  670. }
  671. else if (strcmp(argv[1], "-version") == 0 ||
  672. strcmp(argv[1], "--version") == 0 ||
  673. strcmp(argv[1], "-v") == 0)
  674. {
  675. ShowVersion(errout);
  676. /* called to free hash tables etc. */
  677. DeInitTidy();
  678. return 0;
  679. }
  680. else if(strncmp(argv[1],"--",2)==0)
  681. {
  682. if (ParseConfig(argv[1]+2, argv[2]))
  683. {
  684. ++argv;
  685. --argc;
  686. }
  687. }
  688. else
  689. {
  690. s = argv[1];
  691. while ((c = *++s))
  692. {
  693. if (c == 'i')
  694. {
  695. IndentContent = yes;
  696. SmartIndent = yes;
  697. }
  698. else if (c == 'o')
  699. HideEndTags = yes;
  700. else if (c == 'u')
  701. UpperCaseTags = yes;
  702. else if (c == 'c')
  703. MakeClean = yes;
  704. else if (c == 'n')
  705. NumEntities = yes;
  706. else if (c == 'm')
  707. writeback = yes;
  708. else if (c == 'e')
  709. OnlyErrors = yes;
  710. else if (c == 'q')
  711. Quiet = yes;
  712. else
  713. UnknownOption(stderr, c);
  714. }
  715. }
  716. --argc;
  717. ++argv;
  718. continue;
  719. }
  720. /* ensure config is self-consistent */
  721. AdjustConfig();
  722. /* user specified error file */
  723. if (errfile)
  724. {
  725. FILE *fp;
  726. /* is it same as the currently opened file? */
  727. if (wstrcmp(errfile, current_errorfile) != 0)
  728. {
  729. /* no so close previous error file */
  730. if (errout != stderr)
  731. fclose(errout);
  732. /* and try to open the new error file */
  733. fp = fopen(errfile, "w");
  734. if (fp != null)
  735. {
  736. errout = fp;
  737. current_errorfile = errfile;
  738. }
  739. else /* can't be opened so fall back to stderr */
  740. {
  741. errout = stderr;
  742. current_errorfile = "stderr";
  743. }
  744. }
  745. }
  746. haveFileTimes = no;
  747. if (argc > 1)
  748. {
  749. file = argv[1];
  750. input = fopen(file, "r");
  751. #if PRESERVEFILETIMES
  752. /* get last modified time */
  753. if (KeepFileTimes && input && fstat(fileno(input), &sbuf) != -1)
  754. {
  755. filetimes.actime = sbuf.st_atime;
  756. filetimes.modtime = sbuf.st_mtime;
  757. haveFileTimes = yes;
  758. }
  759. #endif
  760. }
  761. else
  762. {
  763. input = stdin;
  764. file = "stdin";
  765. }
  766. if (input != null)
  767. {
  768. lexer = NewLexer(OpenInput(input));
  769. lexer->errout = errout;
  770. /*
  771. store pointer to lexer in input stream
  772. to allow character encoding errors to be
  773. reported
  774. */
  775. lexer->in->lexer = lexer;
  776. /* Tidy doesn't alter the doctype for generic XML docs */
  777. if (XmlTags)
  778. document = ParseXMLDocument(lexer);
  779. else
  780. {
  781. lexer->warnings = 0;
  782. if (!Quiet)
  783. HelloMessage(errout, release_date, file);
  784. document = ParseDocument(lexer);
  785. if (!CheckNodeIntegrity(document))
  786. {
  787. fprintf(stderr, "\nPanic - tree has lost its integrity\n");
  788. exit(1);
  789. }
  790. /* simplifies <b><b> ... </b> ...</b> etc. */
  791. NestedEmphasis(document);
  792. /* cleans up <dir>indented text</dir> etc. */
  793. List2BQ(document);
  794. BQ2Div(document);
  795. /* replaces i by em and b by strong */
  796. if (LogicalEmphasis)
  797. EmFromI(document);
  798. if (Word2000 && IsWord2000(document))
  799. {
  800. /* prune Word2000's <![if ...]> ... <![endif]> */
  801. DropSections(lexer, document);
  802. /* drop style & class attributes and empty p, span elements */
  803. CleanWord2000(lexer, document);
  804. }
  805. /* replaces presentational markup by style rules */
  806. if (MakeClean || DropFontTags)
  807. CleanTree(lexer, document);
  808. if (!CheckNodeIntegrity(document))
  809. {
  810. fprintf(stderr, "\nPanic - tree has lost its integrity\n");
  811. exit(1);
  812. }
  813. doctype = FindDocType(document);
  814. if (document->content)
  815. {
  816. if (xHTML)
  817. SetXHTMLDocType(lexer, document);
  818. else
  819. FixDocType(lexer, document);
  820. if (TidyMark)
  821. AddGenerator(lexer, document);
  822. }
  823. /* ensure presence of initial <?XML version="1.0"?> */
  824. if (XmlOut && XmlPi)
  825. FixXMLPI(lexer, document);
  826. totalwarnings += lexer->warnings;
  827. totalerrors += lexer->errors;
  828. if (!Quiet && document->content)
  829. {
  830. ReportVersion(errout, lexer, file, doctype);
  831. ReportNumWarnings(errout, lexer);
  832. }
  833. }
  834. if (input != stdin)
  835. {
  836. fclose(input);
  837. }
  838. MemFree(lexer->in);
  839. if (lexer->errors > 0)
  840. NeedsAuthorIntervention(errout);
  841. out.state = FSM_ASCII;
  842. out.encoding = CharEncoding;
  843. if (!OnlyErrors && lexer->errors == 0)
  844. {
  845. if (BurstSlides)
  846. {
  847. Node *body, *this_doctype;
  848. /*
  849. remove doctype to avoid potential clash with
  850. markup introduced when bursting into slides
  851. */
  852. /* discard the document type */
  853. this_doctype = FindDocType(document);
  854. if (this_doctype)
  855. DiscardElement(this_doctype);
  856. /* slides use transitional features */
  857. lexer->versions |= VERS_HTML40_LOOSE;
  858. /* and patch up doctype to match */
  859. if (xHTML)
  860. SetXHTMLDocType(lexer, document);
  861. else
  862. FixDocType(lexer, document);
  863. /* find the body element which may be implicit */
  864. body = FindBody(document);
  865. if (body)
  866. {
  867. ReportNumberOfSlides(errout, CountSlides(body));
  868. CreateSlides(lexer, document);
  869. }
  870. else
  871. MissingBody(errout);
  872. }
  873. else if (writeback && (input = fopen(file, "w")))
  874. {
  875. out.fp = input;
  876. if (XmlTags)
  877. PPrintXMLTree(&out, null, 0, lexer, document);
  878. else
  879. PPrintTree(&out, null, 0, lexer, document);
  880. PFlushLine(&out, 0);
  881. #if PRESERVEFILETIMES
  882. /* set file last accessed/modified times to original values */
  883. if (haveFileTimes)
  884. futime(fileno(input), &filetimes);
  885. #endif
  886. fclose(input);
  887. }
  888. else
  889. {
  890. out.fp = stdout;
  891. if (XmlTags)
  892. PPrintXMLTree(&out, null, 0, lexer, document);
  893. else
  894. PPrintTree(&out, null, 0, lexer, document);
  895. PFlushLine(&out, 0);
  896. }
  897. }
  898. ErrorSummary(lexer);
  899. FreeNode(document);
  900. FreeLexer(lexer);
  901. }
  902. else
  903. UnknownFile(errout, prog, file);
  904. --argc;
  905. ++argv;
  906. if (argc <= 1)
  907. break;
  908. }
  909. if (totalerrors + totalwarnings > 0)
  910. GeneralInfo(errout);
  911. if (errout != stderr)
  912. fclose(errout);
  913. /* called to free hash tables etc. */
  914. DeInitTidy();
  915. /* return status can be used by scripts */
  916. if (totalerrors > 0)
  917. return 2;
  918. if (totalwarnings > 0)
  919. return 1;
  920. /* 0 signifies all is ok */
  921. return 0;
  922. }