PageRenderTime 70ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 1ms

/src/wml_aux/tidy/lexer.c

https://bitbucket.org/shlomif/website-meta-language
C | 2813 lines | 2134 code | 449 blank | 230 comment | 508 complexity | 15657d6d33d5dafd1d1ff4ea2589c6f9 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0, LGPL-2.0
  1. /*
  2. lexer.c - Lexer for html parser
  3. (c) 1998-2000 (W3C) MIT, INRIA, Keio University
  4. See tidy.c for the copyright notice.
  5. */
  6. /*
  7. Given a file stream fp it returns a sequence of tokens.
  8. GetToken(fp) gets the next token
  9. UngetToken(fp) provides one level undo
  10. The tags include an attribute list:
  11. - linked list of attribute/value nodes
  12. - each node has 2 null-terminated strings.
  13. - entities are replaced in attribute values
  14. white space is compacted if not in preformatted mode
  15. If not in preformatted mode then leading white space
  16. is discarded and subsequent white space sequences
  17. compacted to single space chars.
  18. If XmlTags is no then tag names and attribute names
  19. are folded to lower case.
  20. Not yet done:
  21. - Doctype subset and marked sections
  22. */
  23. #include "platform.h"
  24. #include "html.h"
  25. AttVal *ParseAttrs(Lexer *lexer, Bool *isempty); /* forward references */
  26. Node *CommentToken(Lexer *lexer);
  /*
   used to classify chars for lexical purposes

   Yields the lexmap entry for c when c (as unsigned) is below 128
   and 0 otherwise. The argument is parenthesized so that compound
   expressions such as MAP(x & 0x7F) are cast and compared as a
   whole. Note that c is evaluated twice.
  */
  #define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
  29. uint lexmap[128];
  30. #define XHTML_NAMESPACE "http://www.w3.org/1999/xhtml"
  31. /* the 3 URIs for the XHTML 1.0 DTDs */
  32. #define voyager_loose "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
  33. #define voyager_strict "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
  34. #define voyager_frameset "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"
  35. #define W3C_VERSIONS 8
  36. struct _vers
  37. {
  38. char *name;
  39. char *voyager_name;
  40. char *profile;
  41. int code;
  42. } W3C_Version[] =
  43. {
  44. {"HTML 4.01", "XHTML 1.0 Strict", voyager_strict, VERS_HTML40_STRICT},
  45. {"HTML 4.01 Transitional", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML40_LOOSE},
  46. {"HTML 4.01 Frameset", "XHTML 1.0 Frameset", voyager_frameset, VERS_FRAMES},
  47. {"HTML 4.0", "XHTML 1.0 Strict", voyager_strict, VERS_HTML40_STRICT},
  48. {"HTML 4.0 Transitional", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML40_LOOSE},
  49. {"HTML 4.0 Frameset", "XHTML 1.0 Frameset", voyager_frameset, VERS_FRAMES},
  50. {"HTML 3.2", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML32},
  51. {"HTML 2.0", "XHTML 1.0 Strict", voyager_strict, VERS_HTML20}
  52. };
  53. Bool IsWhite(uint c)
  54. {
  55. uint map = MAP(c);
  56. return (Bool)(map & white);
  57. }
  58. Bool IsDigit(uint c)
  59. {
  60. uint map;
  61. map = MAP(c);
  62. return (Bool)(map & digit);
  63. }
  64. Bool IsLetter(uint c)
  65. {
  66. uint map;
  67. map = MAP(c);
  68. return (Bool)(map & letter);
  69. }
  70. uint ToLower(uint c)
  71. {
  72. uint map = MAP(c);
  73. if (map & uppercase)
  74. c += 'a' - 'A';
  75. return c;
  76. }
  #if 0
  /* currently compiled out: counterpart of ToLower() */
  uint ToUpper(uint c)
  {
      if ((MAP(c) & lowercase) == 0)
          return c;

      return c + ('A' - 'a');
  }
  #endif
  86. char FoldCase(char c, Bool tocaps)
  87. {
  88. uint map;
  89. if (!XmlTags)
  90. {
  91. map = MAP(c);
  92. if (tocaps)
  93. {
  94. if (map & lowercase)
  95. c += 'A' - 'a';
  96. }
  97. else /* force to lower case */
  98. {
  99. if (map & uppercase)
  100. c += 'a' - 'A';
  101. }
  102. }
  103. return c;
  104. }
  105. /*
  106. node->type is one of these:
  107. #define TextNode 1
  108. #define StartTag 2
  109. #define EndTag 3
  110. #define StartEndTag 4
  111. */
  112. Lexer *NewLexer(StreamIn *in)
  113. {
  114. Lexer *lexer;
  115. lexer = (Lexer *)MemAlloc(sizeof(Lexer));
  116. if (lexer != null)
  117. {
  118. lexer->in = in;
  119. lexer->lines = 1;
  120. lexer->columns = 1;
  121. lexer->state = LEX_CONTENT;
  122. lexer->badAccess = 0;
  123. lexer->badLayout = 0;
  124. lexer->badChars = 0;
  125. lexer->badForm = 0;
  126. lexer->warnings = 0;
  127. lexer->errors = no;
  128. lexer->waswhite = no;
  129. lexer->pushed = no;
  130. lexer->insertspace = no;
  131. lexer->exiled = no;
  132. lexer->isvoyager = no;
  133. lexer->versions = VERS_EVERYTHING;
  134. lexer->doctype = VERS_UNKNOWN;
  135. lexer->bad_doctype = no;
  136. lexer->txtstart = 0;
  137. lexer->txtend = 0;
  138. lexer->token = null;
  139. lexer->lexbuf = null;
  140. lexer->lexlength = 0;
  141. lexer->lexsize = 0;
  142. lexer->inode = null;
  143. lexer->insert = null;
  144. lexer->istack = null;
  145. lexer->istacklength = 0;
  146. lexer->istacksize = 0;
  147. lexer->istackbase = 0;
  148. lexer->styles = null;
  149. }
  150. return lexer;
  151. }
  152. Bool EndOfInput(Lexer *lexer)
  153. {
  154. return (feof(lexer->in->file));
  155. }
  156. void FreeLexer(Lexer *lexer)
  157. {
  158. if (lexer->pushed)
  159. FreeNode(lexer->token);
  160. if (lexer->lexbuf != null)
  161. MemFree(lexer->lexbuf);
  162. while (lexer->istacksize > 0)
  163. PopInline(lexer, null);
  164. if (lexer->istack)
  165. MemFree(lexer->istack);
  166. if (lexer->styles)
  167. FreeStyles(lexer);
  168. MemFree(lexer);
  169. }
  170. static void AddByte(Lexer *lexer, uint c)
  171. {
  172. if (lexer->lexsize + 1 >= lexer->lexlength)
  173. {
  174. while (lexer->lexsize + 1 >= lexer->lexlength)
  175. {
  176. if (lexer->lexlength == 0)
  177. lexer->lexlength = 8192;
  178. else
  179. lexer->lexlength = lexer->lexlength * 2;
  180. }
  181. lexer->lexbuf = (char *)MemRealloc(lexer->lexbuf, lexer->lexlength*sizeof(char));
  182. }
  183. lexer->lexbuf[lexer->lexsize++] = (char)c;
  184. lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */
  185. }
  186. static void ChangeChar(Lexer *lexer, char c)
  187. {
  188. if (lexer->lexsize > 0)
  189. {
  190. lexer->lexbuf[lexer->lexsize-1] = c;
  191. }
  192. }
  193. /* store char c as UTF-8 encoded byte stream */
  194. void AddCharToLexer(Lexer *lexer, uint c)
  195. {
  196. if (c < 128)
  197. AddByte(lexer, c);
  198. else if (c <= 0x7FF)
  199. {
  200. AddByte(lexer, 0xC0 | (c >> 6));
  201. AddByte(lexer, 0x80 | (c & 0x3F));
  202. }
  203. else if (c <= 0xFFFF)
  204. {
  205. AddByte(lexer, 0xE0 | (c >> 12));
  206. AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
  207. AddByte(lexer, 0x80 | (c & 0x3F));
  208. }
  209. else if (c <= 0x1FFFFF)
  210. {
  211. AddByte(lexer, 0xF0 | (c >> 18));
  212. AddByte(lexer, 0x80 | ((c >> 12) & 0x3F));
  213. AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
  214. AddByte(lexer, 0x80 | (c & 0x3F));
  215. }
  216. else
  217. {
  218. AddByte(lexer, 0xF8 | (c >> 24));
  219. AddByte(lexer, 0x80 | ((c >> 18) & 0x3F));
  220. AddByte(lexer, 0x80 | ((c >> 12) & 0x3F));
  221. AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
  222. AddByte(lexer, 0x80 | (c & 0x3F));
  223. }
  224. }
  #if 0
  /* currently compiled out: append each char of str via AddCharToLexer() */
  static void AddStringToLexer(Lexer *lexer, char *str)
  {
      uint ch;

      for (ch = *str++; ch; ch = *str++)
          AddCharToLexer(lexer, ch);
  }
  #endif
  233. /*
  234. No longer attempts to insert missing ';' for unknown
  235. enitities unless one was present already, since this
  236. gives unexpected results.
  237. For example: <a href="something.htm?foo&bar&fred">
  238. was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
  239. rather than: <a href="something.htm?foo&amp;bar&amp;fred">
  240. My thanks for Maurice Buxton for spotting this.
  241. */
  242. static void ParseEntity(Lexer *lexer, int mode)
  243. {
  244. uint start, map;
  245. Bool first = yes, semicolon = no;
  246. int c, ch, startcol;
  247. start = lexer->lexsize - 1; /* to start at "&" */
  248. startcol = lexer->in->curcol - 1;
  249. while ((c = ReadChar(lexer->in)) != EndOfStream)
  250. {
  251. if (c == ';')
  252. {
  253. semicolon = yes;
  254. break;
  255. }
  256. if (first && c == '#')
  257. {
  258. AddCharToLexer(lexer, c);
  259. first = no;
  260. continue;
  261. }
  262. first = no;
  263. map = MAP(c);
  264. if (map & namechar)
  265. {
  266. AddCharToLexer(lexer, c);
  267. continue;
  268. }
  269. /* otherwise put it back */
  270. UngetChar(c, lexer->in);
  271. break;
  272. }
  273. /* make sure entity is null terminated */
  274. lexer->lexbuf[lexer->lexsize] = '\0';
  275. ch = EntityCode(lexer->lexbuf+start);
  276. /* deal with unrecognized entities */
  277. if (ch <= 0)
  278. {
  279. /* set error position just before offending chararcter */
  280. lexer->lines = lexer->in->curline;
  281. lexer->columns = startcol;
  282. if (lexer->lexsize > start +1 )
  283. {
  284. ReportEntityError(lexer, UNKNOWN_ENTITY, lexer->lexbuf+start, ch);
  285. if (semicolon)
  286. AddCharToLexer(lexer, ';');
  287. }
  288. else /* naked & */
  289. ReportEntityError(lexer, UNESCAPED_AMPERSAND, lexer->lexbuf+start, ch);
  290. }
  291. else
  292. {
  293. if (c != ';') /* issue warning if not terminated by ';' */
  294. {
  295. /* set error position just before offending chararcter */
  296. lexer->lines = lexer->in->curline;
  297. lexer->columns = startcol;
  298. ReportEntityError(lexer, MISSING_SEMICOLON, lexer->lexbuf+start, c);
  299. }
  300. lexer->lexsize = start;
  301. if (ch == 160 && (mode & Preformatted))
  302. ch = ' ';
  303. AddCharToLexer(lexer, ch);
  304. if (ch == '&' && !QuoteAmpersand)
  305. {
  306. AddCharToLexer(lexer, 'a');
  307. AddCharToLexer(lexer, 'm');
  308. AddCharToLexer(lexer, 'p');
  309. AddCharToLexer(lexer, ';');
  310. }
  311. }
  312. }
  313. static char ParseTagName(Lexer *lexer)
  314. {
  315. int map;
  316. uint c;
  317. /* fold case of first char in buffer */
  318. c = lexer->lexbuf[lexer->txtstart];
  319. map = MAP(c);
  320. if (!XmlTags && (map & uppercase) != 0)
  321. {
  322. c -= (uint)('A' - 'a');
  323. lexer->lexbuf[lexer->txtstart] = c;
  324. }
  325. while ((c = ReadChar(lexer->in)) != EndOfStream)
  326. {
  327. map = MAP(c);
  328. if ((map & namechar) == 0)
  329. break;
  330. /* fold case of subsequent chars */
  331. if (!XmlTags && (map & uppercase) != 0)
  332. c -= (uint)('A' - 'a');
  333. AddCharToLexer(lexer, c);
  334. }
  335. lexer->txtend = lexer->lexsize;
  336. return c;
  337. }
  338. /*
  339. Used for elements and text nodes
  340. element name is null for text nodes
  341. start and end are offsets into lexbuf
  342. which contains the textual content of
  343. all elements in the parse tree.
  344. parent and content allow traversal
  345. of the parse tree in any direction.
  346. attributes are represented as a linked
  347. list of AttVal nodes which hold the
  348. strings for attribute/value pairs.
  349. */
  350. Node *NewNode(void)
  351. {
  352. Node *node;
  353. node = (Node *)MemAlloc(sizeof(Node));
  354. node->parent = null;
  355. node->prev = null;
  356. node->next = null;
  357. node->last = null;
  358. node->start = 0;
  359. node->end = 0;
  360. node->type = TextNode;
  361. node->closed = no;
  362. node->implicit = no;
  363. node->linebreak = no;
  364. node->tag = null;
  365. node->was = null;
  366. node->element = null;
  367. node->attributes = null;
  368. node->content = null;
  369. return node;
  370. }
  371. /* used to clone heading nodes when split by an <HR> */
  372. Node *CloneNode(Lexer *lexer, Node *element)
  373. {
  374. Node *node;
  375. node = NewNode();
  376. node->parent = element->parent;
  377. node->start = lexer->lexsize;
  378. node->end = lexer->lexsize;
  379. node->type = element->type;
  380. node->closed = element->closed;
  381. node->implicit = element->implicit;
  382. node->tag = element->tag;
  383. node->element = wstrdup(element->element);
  384. node->attributes = DupAttrs(element->attributes);
  385. return node;
  386. }
  387. /* free node's attributes */
  388. void FreeAttrs(Node *node)
  389. {
  390. AttVal *av;
  391. while (node->attributes)
  392. {
  393. av = node->attributes;
  394. if (av->attribute)
  395. MemFree(av->attribute);
  396. if (av->value)
  397. MemFree(av->value);
  398. node->attributes = av->next;
  399. MemFree(av);
  400. }
  401. }
  402. /* doesn't repair attribute list linkage */
  403. void FreeAttribute(AttVal *av)
  404. {
  405. if (av->attribute)
  406. MemFree(av->attribute);
  407. if (av->value)
  408. MemFree(av->value);
  409. MemFree(av);
  410. }
  411. /* remove attribute from node then free it */
  412. void RemoveAttribute(Node *node, AttVal *attr)
  413. {
  414. AttVal *av, *prev = null, *next;
  415. for (av = node->attributes; av != null; av = next)
  416. {
  417. next = av->next;
  418. if (av == attr)
  419. {
  420. if (prev)
  421. prev->next = next;
  422. else
  423. node->attributes = next;
  424. }
  425. else
  426. prev = av;
  427. }
  428. FreeAttribute(attr);
  429. }
  430. /*
  431. Free document nodes by iterating through peers and recursing
  432. through children. Set next to null before calling FreeNode()
  433. to avoid freeing peer nodes. Doesn't patch up prev/next links.
  434. */
  435. void FreeNode(Node *node)
  436. {
  437. AttVal *av;
  438. Node *next;
  439. while (node)
  440. {
  441. while (node->attributes)
  442. {
  443. av = node->attributes;
  444. if (av->attribute)
  445. MemFree(av->attribute);
  446. if (av->value)
  447. MemFree(av->value);
  448. node->attributes = av->next;
  449. MemFree(av);
  450. }
  451. if (node->element)
  452. MemFree(node->element);
  453. if (node->content)
  454. FreeNode(node->content);
  455. if (node->next)
  456. {
  457. next = node->next;
  458. MemFree(node);
  459. node = next;
  460. continue;
  461. }
  462. node->element = null;
  463. node->tag = null;
  464. #if 0
  465. if (_msize(node) != sizeof (Node)) /* debug */
  466. fprintf(stderr,
  467. "Error in FreeNode() - trying to free corrupted node size %d vs %d\n",
  468. _msize(node), sizeof(Node));
  469. #endif
  470. MemFree(node);
  471. break;
  472. }
  473. }
  474. Node *TextToken(Lexer *lexer)
  475. {
  476. Node *node;
  477. node = NewNode();
  478. node->start = lexer->txtstart;
  479. node->end = lexer->txtend;
  480. return node;
  481. }
  482. /* used for creating preformatted text from Word2000 */
  483. Node *NewLineNode(Lexer *lexer)
  484. {
  485. Node *node = NewNode();
  486. node->start = lexer->lexsize;
  487. AddCharToLexer(lexer, (uint)'\n');
  488. node->end = lexer->lexsize;
  489. return node;
  490. }
  491. static Node *TagToken(Lexer *lexer, uint type)
  492. {
  493. Node *node;
  494. node = NewNode();
  495. node->type = type;
  496. node->element = wstrndup(lexer->lexbuf + lexer->txtstart,
  497. lexer->txtend - lexer->txtstart);
  498. node->start = lexer->txtstart;
  499. node->end = lexer->txtstart;
  500. if (type == StartTag || type == StartEndTag || type == EndTag)
  501. FindTag(node);
  502. return node;
  503. }
  504. Node *CommentToken(Lexer *lexer)
  505. {
  506. Node *node;
  507. node = NewNode();
  508. node->type = CommentTag;
  509. node->start = lexer->txtstart;
  510. node->end = lexer->txtend;
  511. return node;
  512. }
  513. static Node *DocTypeToken(Lexer *lexer)
  514. {
  515. Node *node;
  516. node = NewNode();
  517. node->type = DocTypeTag;
  518. node->start = lexer->txtstart;
  519. node->end = lexer->txtend;
  520. return node;
  521. }
  522. static Node *PIToken(Lexer *lexer)
  523. {
  524. Node *node;
  525. node = NewNode();
  526. node->type = ProcInsTag;
  527. node->start = lexer->txtstart;
  528. node->end = lexer->txtend;
  529. return node;
  530. }
  531. static Node *AspToken(Lexer *lexer)
  532. {
  533. Node *node;
  534. node = NewNode();
  535. node->type = AspTag;
  536. node->start = lexer->txtstart;
  537. node->end = lexer->txtend;
  538. return node;
  539. }
  540. static Node *JsteToken(Lexer *lexer)
  541. {
  542. Node *node;
  543. node = NewNode();
  544. node->type = JsteTag;
  545. node->start = lexer->txtstart;
  546. node->end = lexer->txtend;
  547. return node;
  548. }
  549. /* Added by Baruch Even - handle PHP code too. */
  550. static Node *PhpToken(Lexer *lexer)
  551. {
  552. Node *node;
  553. node = NewNode();
  554. node->type = PhpTag;
  555. node->start = lexer->txtstart;
  556. node->end = lexer->txtend;
  557. return node;
  558. }
  559. /* Word2000 uses <![if ... ]> and <![endif]> */
  560. static Node *SectionToken(Lexer *lexer)
  561. {
  562. Node *node;
  563. node = NewNode();
  564. node->type = SectionTag;
  565. node->start = lexer->txtstart;
  566. node->end = lexer->txtend;
  567. return node;
  568. }
  569. /* CDATA uses <![CDATA[ ... ]]> */
  570. static Node *CDATAToken(Lexer *lexer)
  571. {
  572. Node *node;
  573. node = NewNode();
  574. node->type = CDATATag;
  575. node->start = lexer->txtstart;
  576. node->end = lexer->txtend;
  577. return node;
  578. }
  579. void AddStringLiteral(Lexer *lexer, char *str)
  580. {
  581. unsigned char c;
  582. while((c = *str++) != '\0')
  583. AddCharToLexer(lexer, c);
  584. }
  585. /* find doctype element */
  586. Node *FindDocType(Node *root)
  587. {
  588. Node *node;
  589. for (node = root->content;
  590. node && node->type != DocTypeTag; node = node->next);
  591. return node;
  592. }
  593. /* find html element */
  594. Node *FindHTML(Node *root)
  595. {
  596. Node *node;
  597. for (node = root->content;
  598. node && node->tag != tag_html; node = node->next);
  599. return node;
  600. }
  601. Node *FindHEAD(Node *root)
  602. {
  603. Node *node;
  604. node = FindHTML(root);
  605. if (node)
  606. {
  607. for (node = node->content;
  608. node && node->tag != tag_head; node = node->next);
  609. }
  610. return node;
  611. }
  612. /* add meta element for Tidy */
  613. Bool AddGenerator(Lexer *lexer, Node *root)
  614. {
  615. AttVal *attval;
  616. Node *node;
  617. Node *head = FindHEAD(root);
  618. if (head)
  619. {
  620. for (node = head->content; node; node = node->next)
  621. {
  622. if (node->tag == tag_meta)
  623. {
  624. attval = GetAttrByName(node, "name");
  625. if (attval && attval->value &&
  626. wstrcasecmp(attval->value, "generator") == 0)
  627. {
  628. attval = GetAttrByName(node, "content");
  629. if (attval && attval->value &&
  630. wstrncasecmp(attval->value, "HTML Tidy", 9) == 0)
  631. {
  632. return no;
  633. }
  634. }
  635. }
  636. }
  637. node = InferredTag(lexer, "meta");
  638. AddAttribute(node, "content", "HTML Tidy, see www.w3.org");
  639. AddAttribute(node, "name", "generator");
  640. InsertNodeAtStart(head, node);
  641. return yes;
  642. }
  643. return no;
  644. }
  645. /* examine <!DOCTYPE> to identify version */
  646. static int FindGivenVersion(Lexer *lexer, Node *doctype)
  647. {
  648. char *p, *s = lexer->lexbuf+doctype->start;
  649. uint i, j;
  650. int len;
  651. /* if root tag for doctype isn't html give up now */
  652. if (wstrncasecmp(s, "html ", 5) != 0)
  653. return 0;
  654. s += 5; /* if all is well s -> SYSTEM or PUBLIC */
  655. if (!CheckDocTypeKeyWords(lexer, doctype))
  656. ReportWarning(lexer, doctype, null, DTYPE_NOT_UPPER_CASE);
  657. /* give up if all we are given is the system id for the doctype */
  658. if (wstrncasecmp(s, "SYSTEM ", 7) == 0)
  659. {
  660. /* but at least ensure the case is correct */
  661. if (wstrncmp(s, "SYSTEM", 6) != 0)
  662. memcpy(s, "SYSTEM", 6);
  663. return 0; /* unrecognized */
  664. }
  665. if (wstrncasecmp(s, "PUBLIC ", 7) == 0)
  666. {
  667. if (wstrncmp(s, "PUBLIC", 6) != 0)
  668. memcpy(s, "PUBLIC", 6);
  669. }
  670. else
  671. lexer->bad_doctype = yes;
  672. for (i = doctype->start; i < doctype->end; ++i)
  673. {
  674. if (lexer->lexbuf[i] == '"')
  675. {
  676. if (wstrncmp(lexer->lexbuf+i+1, "-//W3C//DTD ", 12) == 0)
  677. {
  678. p = lexer->lexbuf + i + 13;
  679. /* compute length of identifier e.g. "HTML 4.0 Transitional" */
  680. for (j = i + 13; j < doctype->end && lexer->lexbuf[j] != '/'; ++j);
  681. len = j - i - 13;
  682. for (j = 1; j < W3C_VERSIONS; ++j)
  683. {
  684. s = W3C_Version[j].name;
  685. if (len == wstrlen(s) && wstrncmp(p, s, len) == 0)
  686. return W3C_Version[j].code;
  687. }
  688. /* else unrecognized version */
  689. }
  690. else if (wstrncmp(lexer->lexbuf+i+1, "-//IETF//DTD ", 13) == 0)
  691. {
  692. p = lexer->lexbuf + i + 14;
  693. /* compute length of identifier e.g. "HTML 2.0" */
  694. for (j = i + 14; j < doctype->end && lexer->lexbuf[j] != '/'; ++j);
  695. len = j - i - 14;
  696. s = W3C_Version[0].name;
  697. if (len == wstrlen(s) && wstrncmp(p, s, len) == 0)
  698. return W3C_Version[0].code;
  699. /* else unrecognized version */
  700. }
  701. break;
  702. }
  703. }
  704. return 0;
  705. }
  706. /* return true if substring s is in p and isn't all in upper case */
  707. /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
  708. /* len is how many chars to check in p */
  709. static Bool FindBadSubString(char *s, char *p, int len)
  710. {
  711. int n = wstrlen(s);
  712. while (n < len)
  713. {
  714. if (wstrncasecmp(s, p, n) == 0)
  715. return (wstrncmp(s, p, n) != 0);
  716. ++p;
  717. --len;
  718. }
  719. return 0;
  720. }
  721. Bool CheckDocTypeKeyWords(Lexer *lexer, Node *doctype)
  722. {
  723. char *s = lexer->lexbuf+doctype->start;
  724. int len = doctype->end - doctype->start;
  725. return !(
  726. FindBadSubString("SYSTEM", s, len) ||
  727. FindBadSubString("PUBLIC", s, len) ||
  728. FindBadSubString("//DTD", s, len) ||
  729. FindBadSubString("//W3C", s, len) ||
  730. FindBadSubString("//EN", s, len)
  731. );
  732. }
  733. char *HTMLVersionName(Lexer *lexer)
  734. {
  735. int guessed, j;
  736. guessed = ApparentVersion(lexer);
  737. for (j = 0; j < W3C_VERSIONS; ++j)
  738. {
  739. if (guessed == W3C_Version[j].code)
  740. {
  741. if (lexer->isvoyager)
  742. return W3C_Version[j].voyager_name;
  743. return W3C_Version[j].name;
  744. }
  745. }
  746. return null;
  747. }
  748. static void FixHTMLNameSpace(Lexer *lexer, Node *root, char *profile)
  749. {
  750. Node *node;
  751. AttVal *prev, *attr;
  752. for (node = root->content;
  753. node && node->tag != tag_html; node = node->next);
  754. if (node)
  755. {
  756. prev = null;
  757. for (attr = node->attributes; attr; attr = attr->next)
  758. {
  759. if (wstrcmp(attr->attribute, "xmlns") == 0)
  760. break;
  761. prev = attr;
  762. }
  763. if (attr)
  764. {
  765. if (wstrcmp(attr->value, profile))
  766. {
  767. ReportWarning(lexer, node, null, INCONSISTENT_NAMESPACE);
  768. MemFree(attr->value);
  769. attr->value = wstrdup(profile);
  770. }
  771. }
  772. else
  773. {
  774. attr = NewAttribute();
  775. attr->delim = '"';
  776. attr->attribute = wstrdup("xmlns");
  777. attr->value = wstrdup(profile);
  778. attr->dict = FindAttribute(attr);
  779. attr->next = node->attributes;
  780. node->attributes = attr;
  781. }
  782. }
  783. }
  784. Bool SetXHTMLDocType(Lexer *lexer, Node *root)
  785. {
  786. char *fpi = null, *sysid = null, *name_space = XHTML_NAMESPACE;
  787. Node *doctype;
  788. doctype = FindDocType(root);
  789. if (doctype_mode == doctype_omit)
  790. {
  791. if (doctype)
  792. DiscardElement(doctype);
  793. return yes;
  794. }
  795. if (doctype_mode == doctype_auto)
  796. {
  797. /* see what flavor of XHTML this document matches */
  798. if (lexer->versions & VERS_HTML40_STRICT)
  799. { /* use XHTML strict */
  800. fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
  801. sysid = voyager_strict;
  802. }
  803. else if (lexer->versions & VERS_LOOSE)
  804. {
  805. fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
  806. sysid = voyager_loose;
  807. }
  808. else if (lexer->versions & VERS_FRAMES)
  809. { /* use XHTML frames */
  810. fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
  811. sysid = voyager_frameset;
  812. }
  813. else /* lets assume XHTML transitional */
  814. {
  815. fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
  816. sysid = voyager_loose;
  817. }
  818. }
  819. else if (doctype_mode == doctype_strict)
  820. {
  821. fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
  822. sysid = voyager_strict;
  823. }
  824. else if (doctype_mode == doctype_loose)
  825. {
  826. fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
  827. sysid = voyager_loose;
  828. }
  829. FixHTMLNameSpace(lexer, root, name_space);
  830. if (!doctype)
  831. {
  832. doctype = NewNode();
  833. doctype->type = DocTypeTag;
  834. doctype->next = root->content;
  835. doctype->parent = root;
  836. doctype->prev = null;
  837. root->content = doctype;
  838. }
  839. if (doctype_mode == doctype_user && doctype_str)
  840. {
  841. fpi = doctype_str;
  842. sysid = "";
  843. }
  844. lexer->txtstart = lexer->txtend = lexer->lexsize;
  845. /* add public identifier */
  846. AddStringLiteral(lexer, "html PUBLIC ");
  847. /* check if the fpi is quoted or not */
  848. if (fpi[0] == '"')
  849. AddStringLiteral(lexer, fpi);
  850. else
  851. {
  852. AddStringLiteral(lexer, "\"");
  853. AddStringLiteral(lexer, fpi);
  854. AddStringLiteral(lexer, "\"");
  855. }
  856. if ((unsigned)(wstrlen(sysid) + 6) >= wraplen)
  857. AddStringLiteral(lexer, "\n\"");
  858. else
  859. AddStringLiteral(lexer, "\n \"");
  860. /* add system identifier */
  861. AddStringLiteral(lexer, sysid);
  862. AddStringLiteral(lexer, "\"");
  863. lexer->txtend = lexer->lexsize;
  864. doctype->start = lexer->txtstart;
  865. doctype->end = lexer->txtend;
  866. return no;
  867. }
  868. int ApparentVersion(Lexer *lexer)
  869. {
  870. switch (lexer->doctype)
  871. {
  872. case VERS_UNKNOWN:
  873. return HTMLVersion(lexer);
  874. case VERS_HTML20:
  875. if (lexer->versions & VERS_HTML20)
  876. return VERS_HTML20;
  877. break;
  878. case VERS_HTML32:
  879. if (lexer->versions & VERS_HTML32)
  880. return VERS_HTML32;
  881. break; /* to replace old version by new */
  882. case VERS_HTML40_STRICT:
  883. if (lexer->versions & VERS_HTML40_STRICT)
  884. return VERS_HTML40_STRICT;
  885. break;
  886. case VERS_HTML40_LOOSE:
  887. if (lexer->versions & VERS_HTML40_LOOSE)
  888. return VERS_HTML40_LOOSE;
  889. break; /* to replace old version by new */
  890. case VERS_FRAMES:
  891. if (lexer->versions & VERS_FRAMES)
  892. return VERS_FRAMES;
  893. break;
  894. }
  895. ReportWarning(lexer, null, null, INCONSISTENT_VERSION);
  896. return HTMLVersion(lexer);
  897. }
  898. /* fixup doctype if missing */
  899. Bool FixDocType(Lexer *lexer, Node *root)
  900. {
  901. Node *doctype;
  902. int guessed = VERS_HTML40_STRICT, i;
  903. if (lexer->bad_doctype)
  904. ReportWarning(lexer, null, null, MALFORMED_DOCTYPE);
  905. if (XmlOut)
  906. return yes;
  907. doctype = FindDocType(root);
  908. if (doctype_mode == doctype_omit)
  909. {
  910. if (doctype)
  911. DiscardElement(doctype);
  912. return yes;
  913. }
  914. if (doctype_mode == doctype_strict)
  915. {
  916. DiscardElement(doctype);
  917. doctype = null;
  918. guessed = VERS_HTML40_STRICT;
  919. }
  920. else if (doctype_mode == doctype_loose)
  921. {
  922. DiscardElement(doctype);
  923. doctype = null;
  924. guessed = VERS_HTML40_LOOSE;
  925. }
  926. else if (doctype_mode == doctype_auto)
  927. {
  928. if (doctype)
  929. {
  930. if (lexer->doctype == VERS_UNKNOWN)
  931. return no;
  932. switch (lexer->doctype)
  933. {
  934. case VERS_UNKNOWN:
  935. return no;
  936. case VERS_HTML20:
  937. if (lexer->versions & VERS_HTML20)
  938. return yes;
  939. break; /* to replace old version by new */
  940. case VERS_HTML32:
  941. if (lexer->versions & VERS_HTML32)
  942. return yes;
  943. break; /* to replace old version by new */
  944. case VERS_HTML40_STRICT:
  945. if (lexer->versions & VERS_HTML40_STRICT)
  946. return yes;
  947. break; /* to replace old version by new */
  948. case VERS_HTML40_LOOSE:
  949. if (lexer->versions & VERS_HTML40_LOOSE)
  950. return yes;
  951. break; /* to replace old version by new */
  952. case VERS_FRAMES:
  953. if (lexer->versions & VERS_FRAMES)
  954. return yes;
  955. break; /* to replace old version by new */
  956. }
  957. /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
  958. }
  959. /* choose new doctype */
  960. guessed = HTMLVersion(lexer);
  961. }
  962. if (guessed == VERS_UNKNOWN)
  963. return no;
  964. /* for XML use the Voyager system identifier */
  965. if (XmlOut || XmlTags || lexer->isvoyager)
  966. {
  967. if (doctype)
  968. DiscardElement(doctype);
  969. for (i = 0; i < W3C_VERSIONS; ++i)
  970. {
  971. if (guessed == W3C_Version[i].code)
  972. {
  973. FixHTMLNameSpace(lexer, root, W3C_Version[i].profile);
  974. break;
  975. }
  976. }
  977. return yes;
  978. }
  979. if (!doctype)
  980. {
  981. doctype = NewNode();
  982. doctype->type = DocTypeTag;
  983. doctype->next = root->content;
  984. doctype->parent = root;
  985. doctype->prev = null;
  986. root->content = doctype;
  987. }
  988. lexer->txtstart = lexer->txtend = lexer->lexsize;
  989. /* use the appropriate public identifier */
  990. AddStringLiteral(lexer, "html PUBLIC ");
  991. if (doctype_mode == doctype_user && doctype_str)
  992. AddStringLiteral(lexer, doctype_str);
  993. else if (guessed == VERS_HTML20)
  994. AddStringLiteral(lexer, "\"-//IETF//DTD HTML 2.0//EN\"");
  995. else
  996. {
  997. AddStringLiteral(lexer, "\"-//W3C//DTD ");
  998. for (i = 0; i < W3C_VERSIONS; ++i)
  999. {
  1000. if (guessed == W3C_Version[i].code)
  1001. {
  1002. AddStringLiteral(lexer, W3C_Version[i].name);
  1003. break;
  1004. }
  1005. }
  1006. AddStringLiteral(lexer, "//EN\"");
  1007. }
  1008. lexer->txtend = lexer->lexsize;
  1009. doctype->start = lexer->txtstart;
  1010. doctype->end = lexer->txtend;
  1011. return yes;
  1012. }
  1013. /* ensure XML document starts with <?XML version="1.0"?> */
  1014. Bool FixXMLPI(Lexer *lexer, Node *root)
  1015. {
  1016. Node *xml;
  1017. char *s;
  1018. if( root->content && root->content->type == ProcInsTag)
  1019. {
  1020. s = &lexer->lexbuf[root->content->start];
  1021. if (s[0] == 'x' && s[1] == 'm' && s[2] == 'l')
  1022. return yes;
  1023. }
  1024. xml = NewNode();
  1025. xml->type = ProcInsTag;
  1026. xml->next = root->content;
  1027. if (root->content)
  1028. {
  1029. root->content->prev = xml;
  1030. xml->next = root->content;
  1031. }
  1032. root->content = xml;
  1033. lexer->txtstart = lexer->txtend = lexer->lexsize;
  1034. AddStringLiteral(lexer, "xml version=\"1.0\"");
  1035. lexer->txtend = lexer->lexsize;
  1036. xml->start = lexer->txtstart;
  1037. xml->end = lexer->txtend;
  1038. return no;
  1039. }
  1040. Node *InferredTag(Lexer *lexer, char *name)
  1041. {
  1042. Node *node;
  1043. node = NewNode();
  1044. node->type = StartTag;
  1045. node->implicit = yes;
  1046. node->element = wstrdup(name);
  1047. node->start = lexer->txtstart;
  1048. node->end = lexer->txtend;
  1049. FindTag(node);
  1050. return node;
  1051. }
  1052. static Bool ExpectsContent(Node *node)
  1053. {
  1054. if (node->type != StartTag)
  1055. return no;
  1056. /* unknown element? */
  1057. if (node->tag == null)
  1058. return yes;
  1059. if (node->tag->model & CM_EMPTY)
  1060. return no;
  1061. return yes;
  1062. }
  1063. /*
  1064. create a text node for the contents of
  1065. a CDATA element like style or script
  1066. which ends with </foo> for some foo.
  1067. */
  1068. Node *GetCDATA(Lexer *lexer, Node *container)
  1069. {
  1070. int c, lastc, start, i, len;
  1071. Bool endtag = no;
  1072. lexer->lines = lexer->in->curline;
  1073. lexer->columns = lexer->in->curcol;
  1074. lexer->waswhite = no;
  1075. lexer->txtstart = lexer->txtend = lexer->lexsize;
  1076. lastc = '\0';
  1077. start = -1;
  1078. while ((c = ReadChar(lexer->in)) != EndOfStream)
  1079. {
  1080. /* treat \r\n as \n and \r as \n */
  1081. if (c == '/' && lastc == '<')
  1082. {
  1083. if (endtag)
  1084. {
  1085. lexer->lines = lexer->in->curline;
  1086. lexer->columns = lexer->in->curcol - 3;
  1087. ReportWarning(lexer, null, null, BAD_CDATA_CONTENT);
  1088. }
  1089. start = lexer->lexsize + 1; /* to first letter */
  1090. endtag = yes;
  1091. }
  1092. else if (c == '>' && start >= 0)
  1093. {
  1094. if (((len = lexer->lexsize - start) == wstrlen(container->element)) &&
  1095. wstrncasecmp(lexer->lexbuf+start, container->element, len) == 0)
  1096. {
  1097. lexer->txtend = start - 2;
  1098. break;
  1099. }
  1100. lexer->lines = lexer->in->curline;
  1101. lexer->columns = lexer->in->curcol - 3;
  1102. ReportWarning(lexer, null, null, BAD_CDATA_CONTENT);
  1103. /* if javascript insert backslash before / */
  1104. if (IsJavaScript(container))
  1105. {
  1106. for (i = lexer->lexsize; i > start-1; --i)
  1107. lexer->lexbuf[i] = lexer->lexbuf[i-1];
  1108. lexer->lexbuf[start-1] = '\\';
  1109. lexer->lexsize++;
  1110. }
  1111. start = -1;
  1112. }
  1113. else if (c == '\r')
  1114. {
  1115. c = ReadChar(lexer->in);
  1116. if (c != '\n')
  1117. UngetChar(c, lexer->in);
  1118. c = '\n';
  1119. }
  1120. AddCharToLexer(lexer, (uint)c);
  1121. lexer->txtend = lexer->lexsize;
  1122. lastc = c;
  1123. }
  1124. if (c == EndOfStream)
  1125. ReportWarning(lexer, container, null, MISSING_ENDTAG_FOR);
  1126. if (lexer->txtend > lexer->txtstart)
  1127. return lexer->token = TextToken(lexer);
  1128. return null;
  1129. }
  1130. void UngetToken(Lexer *lexer)
  1131. {
  1132. lexer->pushed = yes;
  1133. }
  1134. /*
  1135. modes for GetToken()
  1136. MixedContent -- for elements which don't accept PCDATA
  1137. Preformatted -- white space preserved as is
  1138. IgnoreMarkup -- for CDATA elements such as script, style
  1139. */
  1140. Node *GetToken(Lexer *lexer, uint mode)
  1141. {
  1142. uint map;
  1143. int c, lastc, badcomment = 0;
  1144. Bool isempty;
  1145. AttVal *attributes;
  1146. if (lexer->pushed)
  1147. {
  1148. /* duplicate inlines in preference to pushed text nodes when appropriate */
  1149. if (lexer->token->type != TextNode || (!lexer->insert && !lexer->inode))
  1150. {
  1151. lexer->pushed = no;
  1152. return lexer->token;
  1153. }
  1154. }
  1155. /* at start of block elements, unclosed inline
  1156. elements are inserted into the token stream */
  1157. if (lexer->insert || lexer->inode)
  1158. return InsertedToken(lexer);
  1159. lexer->lines = lexer->in->curline;
  1160. lexer->columns = lexer->in->curcol;
  1161. lexer->waswhite = no;
  1162. lexer->txtstart = lexer->txtend = lexer->lexsize;
  1163. while ((c = ReadChar(lexer->in)) != EndOfStream)
  1164. {
  1165. if (lexer->insertspace && mode != IgnoreWhitespace)
  1166. {
  1167. AddCharToLexer(lexer, ' ');
  1168. lexer->waswhite = yes;
  1169. lexer->insertspace = no;
  1170. }
  1171. /* treat \r\n as \n and \r as \n */
  1172. if (c == '\r')
  1173. {
  1174. c = ReadChar(lexer->in);
  1175. if (c != '\n')
  1176. UngetChar(c, lexer->in);
  1177. c = '\n';
  1178. }
  1179. AddCharToLexer(lexer, (uint)c);
  1180. switch (lexer->state)
  1181. {
  1182. case LEX_CONTENT: /* element content */
  1183. map = MAP(c);
  1184. /*
  1185. Discard white space if appropriate. Its cheaper
  1186. to do this here rather than in parser methods
  1187. for elements that don't have mixed content.
  1188. */
  1189. if ((map & white) && (mode == IgnoreWhitespace)
  1190. && lexer->lexsize == lexer->txtstart + 1)
  1191. {
  1192. --(lexer->lexsize);
  1193. lexer->waswhite = no;
  1194. lexer->lines = lexer->in->curline;
  1195. lexer->columns = lexer->in->curcol;
  1196. continue;
  1197. }
  1198. if (c == '<')
  1199. {
  1200. lexer->state = LEX_GT;
  1201. continue;
  1202. }
  1203. if ((map & white) != 0)
  1204. {
  1205. /* was previous char white? */
  1206. if (lexer->waswhite)
  1207. {
  1208. if (mode != Preformatted && mode != IgnoreMarkup)
  1209. {
  1210. --(lexer->lexsize);
  1211. lexer->lines = lexer->in->curline;
  1212. lexer->columns = lexer->in->curcol;
  1213. }
  1214. }
  1215. else /* prev char wasn't white */
  1216. {
  1217. lexer->waswhite = yes;
  1218. lastc = c;
  1219. if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
  1220. ChangeChar(lexer, ' ');
  1221. }
  1222. continue;
  1223. }
  1224. else if (c == '&' && mode != IgnoreMarkup)
  1225. ParseEntity(lexer, mode);
  1226. /* this is needed to avoid trimming trailing whitespace */
  1227. if (mode == IgnoreWhitespace)
  1228. mode = MixedContent;
  1229. lexer->waswhite = no;
  1230. continue;
  1231. case LEX_GT: /* < */
  1232. /* check for endtag */
  1233. if (c == '/')
  1234. {
  1235. if ((c = ReadChar(lexer->in)) == EndOfStream)
  1236. {
  1237. UngetChar(c, lexer->in);
  1238. continue;
  1239. }
  1240. AddCharToLexer(lexer, c);
  1241. map = MAP(c);
  1242. if ((map & letter) != 0)
  1243. {
  1244. lexer->lexsize -= 3;
  1245. lexer->txtend = lexer->lexsize;
  1246. UngetChar(c, lexer->in);
  1247. lexer->state = LEX_ENDTAG;
  1248. lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */
  1249. lexer->in->curcol -= 2;
  1250. /* if some text before the </ return it now */
  1251. if (lexer->txtend > lexer->txtstart)
  1252. {
  1253. /* trim space char before end tag */
  1254. if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
  1255. {
  1256. lexer->lexsize -= 1;
  1257. lexer->txtend = lexer->lexsize;
  1258. }
  1259. return lexer->token = TextToken(lexer);
  1260. }
  1261. continue; /* no text so keep going */
  1262. }
  1263. /* otherwise treat as CDATA */
  1264. lexer->waswhite = no;
  1265. lexer->state = LEX_CONTENT;
  1266. continue;
  1267. }
  1268. if (mode == IgnoreMarkup)
  1269. {
  1270. /* otherwise treat as CDATA */
  1271. lexer->waswhite = no;
  1272. lexer->state = LEX_CONTENT;
  1273. continue;
  1274. }
  1275. /*
  1276. look out for comments, doctype or marked sections
  1277. this isn't quite right, but its getting there ...
  1278. */
  1279. if (c == '!')
  1280. {
  1281. c = ReadChar(lexer->in);
  1282. if (c == '-')
  1283. {
  1284. c = ReadChar(lexer->in);
  1285. if (c == '-')
  1286. {
  1287. lexer->state = LEX_COMMENT; /* comment */
  1288. lexer->lexsize -= 2;
  1289. lexer->txtend = lexer->lexsize;
  1290. /* if some text before < return it now */
  1291. if (lexer->txtend > lexer->txtstart)
  1292. return lexer->token = TextToken(lexer);
  1293. lexer->txtstart = lexer->lexsize;
  1294. continue;
  1295. }
  1296. ReportWarning(lexer, null, null, MALFORMED_COMMENT);
  1297. }
  1298. else if (c == 'd' || c == 'D')
  1299. {
  1300. lexer->state = LEX_DOCTYPE; /* doctype */
  1301. lexer->lexsize -= 2;
  1302. lexer->txtend = lexer->lexsize;
  1303. mode = IgnoreWhitespace;
  1304. /* skip until white space or '>' */
  1305. for (;;)
  1306. {
  1307. c = ReadChar(lexer->in);
  1308. if (c == EndOfStream || c == '>')
  1309. {
  1310. UngetChar(c, lexer->in);
  1311. break;
  1312. }
  1313. map = MAP(c);
  1314. if (!(map & white))
  1315. continue;
  1316. /* and skip to end of whitespace */
  1317. for (;;)
  1318. {
  1319. c = ReadChar(lexer->in);
  1320. if (c == EndOfStream || c == '>')
  1321. {
  1322. UngetChar(c, lexer->in);
  1323. break;
  1324. }
  1325. map = MAP(c);
  1326. if (map & white)
  1327. continue;
  1328. UngetChar(c, lexer->in);
  1329. break;
  1330. }
  1331. break;
  1332. }
  1333. /* if some text before < return it now */
  1334. if (lexer->txtend > lexer->txtstart)
  1335. return lexer->token = TextToken(lexer);
  1336. lexer->txtstart = lexer->lexsize;
  1337. continue;
  1338. }
  1339. else if (c == '[')
  1340. {
  1341. /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
  1342. lexer->lexsize -= 2;
  1343. lexer->state = LEX_SECTION;
  1344. lexer->txtend = lexer->lexsize;
  1345. /* if some text before < return it now */
  1346. if (lexer->txtend > lexer->txtstart)
  1347. return lexer->token = TextToken(lexer);
  1348. lexer->txtstart = lexer->lexsize;
  1349. continue;
  1350. }
  1351. /* otherwise swallow chars up to and including next '>' */
  1352. while ((c = ReadChar(lexer->in)) != '>')
  1353. {
  1354. if (c == -1)
  1355. {
  1356. UngetChar(c, lexer->in);
  1357. break;
  1358. }
  1359. }
  1360. lexer->lexsize -= 2;
  1361. lexer->lexbuf[lexer->lexsize] = '\0';
  1362. lexer->state = LEX_CONTENT;
  1363. continue;
  1364. }
  1365. /*
  1366. processing instructions
  1367. */
  1368. if (c == '?')
  1369. {
  1370. lexer->lexsize -= 2;
  1371. lexer->state = LEX_PROCINSTR;
  1372. lexer->txtend = lexer->lexsize;
  1373. /* if some text before < return it now */
  1374. if (lexer->txtend > lexer->txtstart)
  1375. return lexer->token = TextToken(lexer);
  1376. lexer->txtstart = lexer->lexsize;
  1377. continue;
  1378. }
  1379. /* Microsoft ASP's e.g. <% ... server-code ... %> */
  1380. if (c == '%')
  1381. {
  1382. lexer->lexsize -= 2;
  1383. lexer->state = LEX_ASP;
  1384. lexer->txtend = lexer->lexsize;
  1385. /* if some text before < return it now */
  1386. if (lexer->txtend > lexer->txtstart)
  1387. return lexer->token = TextToken(lexer);
  1388. lexer->txtstart = lexer->lexsize;
  1389. continue;
  1390. }
  1391. /* Netscapes JSTE e.g. <# ... server-code ... #> */
  1392. if (c == '#')
  1393. {
  1394. lexer->lexsize -= 2;
  1395. lexer->state = LEX_JSTE;
  1396. lexer->txtend = lexer->lexsize;
  1397. /* if some text before < return it now */
  1398. if (lexer->txtend > lexer->txtstart)
  1399. return lexer->token = TextToken(lexer);
  1400. lexer->txtstart = lexer->lexsize;
  1401. continue;
  1402. }
  1403. map = MAP(c);
  1404. /* check for start tag */
  1405. if ((map & letter) != 0)
  1406. {
  1407. UngetChar(c, lexer->in); /* push back letter */
  1408. lexer->lexsize -= 2; /* discard "<" + letter */
  1409. lexer->txtend = lexer->lexsize;
  1410. lexer->state = LEX_STARTTAG; /* ready to read tag name */
  1411. /* if some text before < return it now */
  1412. if (lexer->txtend > lexer->txtstart)
  1413. return lexer->token = TextToken(lexer);
  1414. continue; /* no text so keep going */
  1415. }
  1416. /* otherwise treat as CDATA */
  1417. lexer->state = LEX_CONTENT;
  1418. lexer->waswhite = no;
  1419. continue;
  1420. case LEX_ENDTAG: /* </letter */
  1421. lexer->txtstart = lexer->lexsize - 1;
  1422. lexer->in->curcol += 2;
  1423. c = ParseTagName(lexer);
  1424. lexer->token = TagToken(lexer, EndTag); /* create endtag token */
  1425. lexer->lexsize = lexer->txtend = lexer->txtstart;
  1426. /* skip to '>' */
  1427. while (c != '>')
  1428. {
  1429. c = ReadChar(lexer->in);
  1430. if (c == EndOfStream)
  1431. break;
  1432. }
  1433. if (c == EndOfStream)
  1434. {
  1435. UngetChar(c, lexer->in);
  1436. continue;
  1437. }
  1438. lexer->state = LEX_CONTENT;
  1439. lexer->waswhite = no;
  1440. return lexer->token; /* the endtag token */
  1441. case LEX_STARTTAG: /* first letter of tagname */
  1442. lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
  1443. c = ParseTagName(lexer);
  1444. isempty = no;
  1445. attributes = null;
  1446. lexer->token = TagToken(lexer, (isempty ? StartEndTag : StartTag));
  1447. /* parse attributes, consuming closing ">" */
  1448. if (c != '>')
  1449. {
  1450. if (c == '/')
  1451. UngetChar(c, lexer->in);
  1452. attributes = ParseAttrs(lexer, &isempty);
  1453. }
  1454. if (isempty)
  1455. lexer->token->type = StartEndTag;
  1456. lexer->token->attributes = attributes;
  1457. lexer->lexsize = lexer->txtend = lexer->txtstart;
  1458. /* swallow newline following start tag */
  1459. /* special check needed for CRLF sequence */
  1460. /* this doesn't apply to empty elements */
  1461. if (ExpectsContent(lexer->token) ||
  1462. lexer->token->tag == tag_br)
  1463. {
  1464. c = ReadChar(lexer->in);
  1465. if (c == '\r')
  1466. {
  1467. c = ReadChar(lexer->in);
  1468. if (c != '\n')
  1469. UngetChar(c, lexer->in);
  1470. }
  1471. else if (c != '\n' && c != '\f')
  1472. UngetChar(c, lexer->in);
  1473. lexer->waswhite = yes; /* to swallow leading whitespace */
  1474. }
  1475. else
  1476. lexer->waswhite = no;
  1477. lexer->state = LEX_CONTENT;
  1478. if (lexer->token->tag == null)
  1479. ReportError(lexer, null, lexer->token, UNKNOWN_ELEMENT);
  1480. else if (!XmlTags)
  1481. {
  1482. lexer->versions &= lexer->token->tag->versions;
  1483. if (lexer->token->tag->versions & VERS_PROPRIETARY)
  1484. {
  1485. if (!MakeClean && (lexer->token->tag == tag_nobr ||
  1486. lexer->token->tag == tag_wbr))
  1487. ReportWarning(lexer, null, lexer->token, PROPRIETARY_ELEMENT);
  1488. }
  1489. if (lexer->token->tag->chkattrs)
  1490. {
  1491. CheckUniqueAttributes(lexer, lexer->token);
  1492. lexer->token->tag->chkattrs(lexer, lexer->token);
  1493. }
  1494. else
  1495. CheckAttributes(lexer, lexer->token);
  1496. }
  1497. return lexer->token; /* return start tag */
  1498. case LEX_COMMENT: /* seen <!-- so look for --> */
  1499. if (c != '-')
  1500. continue;
  1501. c = ReadChar(lexer->in);
  1502. AddCharToLexer(lexer, c);
  1503. if (c != '-')
  1504. continue;
  1505. end_comment:
  1506. c = ReadChar(lexer->in);
  1507. if (c == '>')
  1508. {
  1509. if (badcomment)
  1510. ReportWarning(lexer, null, null, MALFORMED_COMMENT);
  1511. lexer->txtend = lexer->lexsize;
  1512. lexer->lexbuf[lexer->lexsize] = '\0';
  1513. lexer->state = LEX_CONTENT;
  1514. lexer->waswhite = no;
  1515. lexer->token = CommentToken(lexer);
  1516. /* now look for a line break */
  1517. c = ReadChar(lexer->in);
  1518. if (c == '\r')
  1519. {
  1520. c = ReadChar(lexer->in);
  1521. if (c != '\n')
  1522. lexer->token->linebreak = yes;
  1523. }
  1524. if (c == '\n')
  1525. lexer->token->linebreak = yes;
  1526. else
  1527. UngetChar(c, lexer->in);
  1528. return lexer->token;
  1529. }
  1530. /* note position of first such error in the comment */
  1531. if (!badcomment)
  1532. {
  1533. lexer->lines = lexer->in->curline;
  1534. lexer->columns = lexer->in->curcol - 3;
  1535. }
  1536. badcomment++;
  1537. if (FixComments)
  1538. lexer->lexbuf[lexer->lexsize - 2] = '=';
  1539. AddCharToLexer(lexer, c);
  1540. /* if '-' then look for '>' to end the comment */
  1541. if (c == '-')
  1542. goto end_comment;
  1543. /* otherwise continue to look for --> */
  1544. lexer->lexbuf[lexer->lexsize - 2] = '=';
  1545. continue;
  1546. case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
  1547. map = MAP(c);
  1548. if (map & white)
  1549. {
  1550. if (lexer->waswhite)
  1551. lexer->lexsize -= 1;
  1552. lexer->waswhite = yes;
  1553. }
  1554. else
  1555. lexer->waswhite = no;
  1556. if (c != '>')
  1557. continue;
  1558. lexer->lexsize -= 1;
  1559. lexer->txtend = lexer->lexsize;
  1560. lexer->lexbuf[lexer->lexsize] = '\0';
  1561. lexer->state = LEX_CONTENT;
  1562. lexer->waswhite = no;
  1563. lexer->token = DocTypeToken(lexer);
  1564. /* make a note of the version named by the doctype */
  1565. lexer->doctype = FindGivenVersion(lexer, lexer->token);
  1566. return lexer->token;
  1567. case LEX_PROCINSTR: /* seen <? so look for '>' */
  1568. /* check for PHP preprocessor instructions <?php ... ?> */
  1569. if (lexer->lexsize - lexer->txtstart == 3)
  1570. {
  1571. if (wstrncmp(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
  1572. {
  1573. lexer->state = LEX_PHP;
  1574. continue;
  1575. }
  1576. }
  1577. if (XmlPIs) /* insist on ?> as terminator */
  1578. {
  1579. if (c != '?')
  1580. continue;
  1581. /* now look for '>' */
  1582. c = ReadChar(lexer->in);
  1583. if (c == EndOfStream)
  1584. {
  1585. ReportWarning(lexer, null, null, UNEXPECTED_END_OF_FILE);
  1586. UngetChar(c, lexer->in);
  1587. continue;
  1588. }
  1589. AddCharToLexer(lexer, c);
  1590. }
  1591. if (c != '>')
  1592. continue;
  1593. lexer->lexsize -= 1;
  1594. lexer->txtend = lexer->lexsize;
  1595. lexer->lexbuf[lexer->lexsize] = '\0';
  1596. lexer->state = LEX_CONTENT;
  1597. lexer->waswhite = no;
  1598. return lexer->token = PIToken(lexer);
  1599. case LEX_ASP: /* seen <% so look for "%>" */
  1600. if (c != '%')
  1601. continue;
  1602. /* now look for '>' */
  1603. c = ReadChar(lexer->in);
  1604. if (c != '>')
  1605. {
  1606. UngetChar(c, lexer->in);
  1607. continue;
  1608. }
  1609. lexer->lexsize -= 1;
  1610. lexer->txtend = lexer->lexsize;
  1611. lexer->lexbuf[lexer->lexsize] = '\0';
  1612. lexer->state = LEX_CONTENT;
  1613. lexer->waswhite = no;
  1614. return lexer->token = AspToken(lexer);
  1615. case LEX_JSTE: /* seen <# so look for "#>" */
  1616. if (c != '#')
  1617. continue;
  1618. /* now look for '>' */
  1619. c = ReadChar(lexer->in);
  1620. if (c != '>')
  1621. {
  1622. UngetChar(c, lexer->in);
  1623. continue;
  1624. }
  1625. lexer->lexsize -= 1;
  1626. lexer->txtend = lexer->lexsize;
  1627. lexer->lexbuf[lexer->lexsize] = '\0';
  1628. lexer->state = LEX_CONTENT;
  1629. lexer->waswhite = no;
  1630. return lexer->token = JsteToken(lexer);
  1631. case LEX_PHP: /* seen "<?php" so look for "?>" */
  1632. if (c != '?')
  1633. continue;
  1634. /* now look for '>' */
  1635. c = ReadChar(lexer->in);
  1636. if (c != '>')
  1637. {
  1638. UngetChar(c, lexer->in);
  1639. continue;
  1640. }
  1641. lexer->lexsize -= 1;
  1642. lexer->txtend = lexer->lexsize;
  1643. lexer->lexbuf[lexer->lexsize] = '\0';
  1644. lexer->state = LEX_CONTENT;
  1645. lexer->waswhite = no;
  1646. return lexer->token = PhpToken(lexer);
  1647. case LEX_SECTION: /* seen "<![" so look for "]>" */
  1648. if (c == '[')
  1649. {
  1650. if (lexer->lexsize == (lexer->txtstart + 6) &&
  1651. wstrncmp(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
  1652. {
  1653. lexer->state = LEX_CDATA;
  1654. lexer->lexsize -= 6;
  1655. continue;
  1656. }
  1657. }
  1658. if (c != ']')
  1659. continue;
  1660. /* now look for '>' */
  1661. c = ReadChar(lexer->in);
  1662. if (c != '>')
  1663. {
  1664. UngetChar(c, lexer->in);
  1665. continue;
  1666. }
  1667. lexer->lexsize -= 1;
  1668. lexer->txtend = lexer->lexsize;
  1669. lexer->lexbuf[lexer->lexsize] = '\0';
  1670. lexer->state = LEX_CONTENT;
  1671. lexer->waswhite = no;
  1672. return lexer->token = SectionToken(lexer);
  1673. case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
  1674. if (c != ']')
  1675. continue;
  1676. /* now look for ']' */
  1677. c = ReadChar(lexer->in);
  1678. if (c != ']')
  1679. {
  1680. UngetChar(c, lexer->in);
  1681. continue;
  1682. }
  1683. /* now look for '>' */
  1684. c = ReadChar(lexer->in);
  1685. if (c != '>')
  1686. {
  1687. UngetChar(c, lexer->in);
  1688. continue;
  1689. }
  1690. lexer->lexsize -= 1;
  1691. lexer->txtend = lexer->lexsize;
  1692. lexer->lexbuf[lexer->lexsize] = '\0';
  1693. lexer->state = LEX_CONTENT;
  1694. lexer->waswhite = no;
  1695. return lexer->token = CDATAToken(lexer);
  1696. }
  1697. }
  1698. if (lexer->state == LEX_CONTENT) /* text string */
  1699. {
  1700. lexer->txtend = lexer->lexsize;
  1701. if (lexer->txtend > lexer->txtstart)
  1702. {
  1703. UngetChar(c, lexer->in);
  1704. if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
  1705. {
  1706. lexer->lexsize -= 1;
  1707. lexer->txtend = lexer->lexsize;
  1708. }
  1709. return lexer->token = TextToken(lexer);
  1710. }
  1711. }
  1712. else if (lexer->state == LEX_COMMENT) /* comment */
  1713. {
  1714. if (c == EndOfStream)
  1715. ReportWarning(lexer, null, null, MALFORMED_COMMENT);
  1716. lexer->txtend = lexer->lexsize;
  1717. lexer->lexbuf[lexer->lexsize] = '\0';
  1718. lexer->state = LEX_CONTENT;
  1719. lexer->waswhite = no;
  1720. return lexer->token = CommentToken(lexer);
  1721. }
  1722. return 0;
  1723. }
  1724. static void MapStr(char *str, uint code)
  1725. {
  1726. uint i;
  1727. while (*str)
  1728. {
  1729. i = (uint)(*str++);
  1730. lexmap[i] |= code;
  1731. }
  1732. }
  1733. void InitMap(void)
  1734. {
  1735. MapStr("\r\n\f", newline|white);
  1736. MapStr(" \t", white);
  1737. MapStr("-.:_", namechar);
  1738. MapStr("0123456789", digit|namechar);
  1739. MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
  1740. MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
  1741. }
  1742. /*
  1743. parser for ASP within start tags
  1744. Some people use ASP for to customize attributes
  1745. Tidy isn't really well suited to dealing with ASP
  1746. This is a workaround for attributes, but won't
  1747. deal with the case where the ASP is used to tailor
  1748. the attribute value. Here is an example of a work
  1749. around for using ASP in attribute values:
  1750. href="<%=rsSchool.Fields("ID").Value%>"
  1751. where the ASP that generates the attribute value
  1752. is masked from Tidy by the quotemarks.
  1753. */
  1754. static Node *ParseAsp(Lexer *lexer)
  1755. {
  1756. uint c;
  1757. Node *asp = null;
  1758. lexer->txtstart = lexer->lexsize;
  1759. for (;;)
  1760. {
  1761. c = ReadChar(lexer->in);
  1762. AddCharToLexer(lexer, c);
  1763. if (c != '%')
  1764. continue;
  1765. c = ReadChar(lexer->in);
  1766. AddCharToLexer(lexer, c);
  1767. if (c == '>')
  1768. break;
  1769. }
  1770. lexer->lexsize -= 2;
  1771. lexer->txtend = lexer->lexsize;
  1772. if (lexer->txtend > lexer->txtstart)
  1773. asp = AspToken(lexer);
  1774. lexer->txtstart = lexer->txtend;
  1775. return asp;
  1776. }
  1777. /*
  1778. PHP is like ASP but is based upon XML
  1779. processing instructions, e.g. <?php ... ?>
  1780. */
  1781. static Node *ParsePhp(Lexer *lexer)
  1782. {
  1783. uint c;
  1784. Node *php = null;
  1785. lexer->txtstart = lexer->lexsize;
  1786. for (;;)
  1787. {
  1788. c = ReadChar(lexer->in);
  1789. AddCharToLexer(lexer, c);
  1790. if (c != '?')
  1791. continue;
  1792. c = ReadChar(lexer->in);
  1793. AddCharToLexer(lexer, c);
  1794. if (c == '>')
  1795. break;
  1796. }
  1797. lexer->lexsize -= 2;
  1798. lexer->txtend = lexer->lexsize;
  1799. if (lexer->txtend > lexer->txtstart)
  1800. php = PhpToken(lexer);
  1801. lexer->txtstart = lexer->txtend;
  1802. return php;
  1803. }
  1804. /* consumes the '>' terminating start tags */
  1805. static char *ParseAttribute(Lexer *lexer, Bool *isempty,
  1806. Node **asp, Node **php)
  1807. {
  1808. int map, start, len = 0;
  1809. char *attr;
  1810. uint c;
  1811. *asp = null; /* clear asp pointer */
  1812. *php = null; /* clear php pointer */
  1813. /* skip white space before the attribute */
  1814. for (;;)
  1815. {
  1816. c = ReadChar(lexer->in);
  1817. if (c == '/')
  1818. {
  1819. c = ReadChar(lexer->in);
  1820. if (c == '>')
  1821. {
  1822. *isempty = yes;
  1823. return null;
  1824. }
  1825. UngetChar(c, lexer->in);
  1826. c = '/';
  1827. break;
  1828. }
  1829. if (c == '>')
  1830. return null;
  1831. if (c =='<')
  1832. {
  1833. c = ReadChar(lexer->in);
  1834. if (c == '%')
  1835. {
  1836. *asp = ParseAsp(lexer);
  1837. return null;
  1838. }
  1839. else if (c == '?')
  1840. {
  1841. *php = ParsePhp(lexer);
  1842. return null;
  1843. }
  1844. UngetChar(c, lexer->in);
  1845. ReportAttrError(lexer, lexer->token, null, UNEXPECTED_GT);
  1846. return null;
  1847. }
  1848. if (c == '"' || c == '\'')
  1849. {
  1850. ReportAttrError(lexer, lexer->token, null, UNEXPECTED_QUOTEMARK);
  1851. continue;
  1852. }
  1853. if (c == EndOfStream)
  1854. {
  1855. ReportAttrError(lexer, lexer->token, null, UNEXPECTED_END_OF_FILE);
  1856. UngetChar(c, lexer->in);
  1857. return null;
  1858. }
  1859. map = MAP(c);
  1860. if ((map & white) == 0)
  1861. break;
  1862. }
  1863. start = lexer->lexsize;
  1864. for (;;)
  1865. {
  1866. /* but push back '=' for parseValue() */
  1867. if (c == '=' || c == '>')
  1868. {
  1869. UngetChar(c, lexer->in);
  1870. break;
  1871. }
  1872. if (c == '<' || c == EndOfStream)
  1873. {
  1874. UngetChar(c, lexer->in);
  1875. break;
  1876. }
  1877. map = MAP(c);
  1878. if ((map & white) != 0)
  1879. break;
  1880. /* what should be done about non-namechar characters? */
  1881. /* currently these are incorporated into the attr name */
  1882. if (!XmlTags && (map & uppercase) != 0)
  1883. c += (uint)('a' - 'A');
  1884. ++len;
  1885. AddCharToLexer(lexer, c);
  1886. c = ReadChar(lexer->in);
  1887. }
  1888. attr = (len > 0 ? wstrndup(lexer->lexbuf+start, len) : null);
  1889. lexer->lexsize = start;
  1890. return attr;
  1891. }
  1892. /*
  1893. invoked when < is seen in place of attribute value
  1894. but terminates on whitespace if not ASP, PHP or Tango
  1895. this routine recognizes ' and " quoted strings
  1896. */
  1897. static int ParseServerInstruction(Lexer *lexer)
  1898. {
  1899. int c, map, delim = '"';
  1900. Bool isrule = no;
  1901. c = ReadChar(lexer->in);
  1902. AddCharToLexer(lexer, c);
  1903. /* check for ASP, PHP or Tango */
  1904. if (c == '%' || c == '?' || c == '@')
  1905. isrule = yes;
  1906. for (;;)
  1907. {
  1908. c = ReadChar(lexer->in);
  1909. if (c == EndOfStream)
  1910. break;
  1911. if (c == '>')
  1912. {
  1913. if (isrule)
  1914. AddCharToLexer(lexer, c);
  1915. else
  1916. UngetChar(c, lexer->in);
  1917. break;
  1918. }
  1919. /* if not recognized as ASP, PHP or Tango */
  1920. /* then also finish value on whitespace */
  1921. if (!isrule)
  1922. {
  1923. map = MAP(c);
  1924. if ((map & white) != 0)
  1925. break;
  1926. }
  1927. AddCharToLexer(lexer, c);
  1928. if (c == '"')
  1929. {
  1930. do
  1931. {
  1932. c = ReadChar(lexer->in);
  1933. AddCharToLexer(lexer, c);
  1934. }
  1935. while (c != '"');
  1936. delim = '\'';
  1937. continue;
  1938. }
  1939. if (c == '\'')
  1940. {
  1941. do
  1942. {
  1943. c = ReadChar(lexer->in);
  1944. AddCharToLexer(lexer, c);
  1945. }
  1946. while (c != '\'');
  1947. }
  1948. }
  1949. return delim;
  1950. }
  1951. /* values start with "=" or " = " etc. */
  1952. /* doesn't consume the ">" at end of start tag */
  1953. static char *ParseValue(Lexer *lexer, char *name,
  1954. Bool foldCase, Bool *isempty, int *pdelim)
  1955. {
  1956. int len = 0, start, map;
  1957. Bool seen_gt = no;
  1958. Bool munge = yes;
  1959. uint c, lastc, delim, quotewarning;
  1960. char *value;
  1961. delim = (char) 0;
  1962. *pdelim = '"';
  1963. /*
  1964. Henry Zrepa reports that some folk are using the
  1965. embed element with script attributes where newlines
  1966. are significant and must be preserved
  1967. */
  1968. if (LiteralAttribs)
  1969. munge = no;
  1970. /* skip white space before the '=' */
  1971. for (;;)
  1972. {
  1973. c = ReadChar(lexer->in);
  1974. if (c == EndOfStream)
  1975. {
  1976. UngetChar(c, lexer->in);
  1977. break;
  1978. }
  1979. map = MAP(c);
  1980. if ((map & white) == 0)
  1981. break;
  1982. }
  1983. /*
  1984. c should be '=' if there is a value
  1985. other legal possibilities are white
  1986. space, '/' and '>'
  1987. */
  1988. if (c != '=')
  1989. {
  1990. UngetChar(c, lexer->in);
  1991. return null;
  1992. }
  1993. /* skip white space after '=' */
  1994. for (;;)
  1995. {
  1996. c = ReadChar(lexer->in);
  1997. if (c == EndOfStream)
  1998. {
  1999. UngetChar(c, lexer->in);
  2000. break;
  2001. }
  2002. map = MAP(c);
  2003. if ((map & white) == 0)
  2004. break;
  2005. }
  2006. /* check for quote marks */
  2007. if (c == '"' || c == '\'')
  2008. delim = c;
  2009. else if (c == '<')
  2010. {
  2011. start = lexer->lexsize;
  2012. AddCharToLexer(lexer, c);
  2013. *pdelim = ParseServerInstruction(lexer);
  2014. len = lexer->lexsize - start;
  2015. lexer->lexsize = start;
  2016. return (len > 0 ? wstrndup(lexer->lexbuf+start, len) : null);
  2017. }
  2018. else
  2019. UngetChar(c, lexer->in);
  2020. /*
  2021. and read the value string
  2022. check for quote mark if needed
  2023. */
  2024. quotewarning = 0;
  2025. start = lexer->lexsize;
  2026. c = '\0';
  2027. for (;;)
  2028. {
  2029. lastc = c; /* track last character */
  2030. c = ReadChar(lexer->in);
  2031. if (c == EndOfStream)
  2032. {
  2033. ReportAttrError(lexer, lexer->token, null, UNEXPECTED_END_OF_FILE);
  2034. UngetChar(c, lexer->in);
  2035. break;
  2036. }
  2037. if (delim == (char)0)
  2038. {
  2039. if (c == '>')
  2040. {
  2041. UngetChar(c, lexer->in);
  2042. break;
  2043. }
  2044. if (c == '"' || c == '\'')
  2045. {
  2046. ReportAttrError(lexer, lexer->token, null, UNEXPECTED_QUOTEMARK);
  2047. break;
  2048. }
  2049. if (c == '<')
  2050. {
  2051. /* UngetChar(c, lexer->in); */
  2052. ReportAttrError(lexer, lexer->token, null, UNEXPECTED_GT);
  2053. /* break; */
  2054. }
  2055. /*
  2056. For cases like <br clear=all/> need to avoid treating /> as
  2057. part of the attribute value, however care is needed to avoid
  2058. so treating <a href=http://www.acme.com/> in this way, which
  2059. would map the <a> tag to <a href="http://www.acme.com"/>
  2060. */
  2061. if (c == '/')
  2062. {
  2063. /* peek ahead in case of /> */
  2064. c = ReadChar(lexer->in);
  2065. if (c == '>' && !IsUrl(name))
  2066. {
  2067. *isempty = yes;
  2068. UngetChar(c, lexer->in);
  2069. break;
  2070. }
  2071. /* unget peeked char */
  2072. UngetChar(c, lexer->in);
  2073. c = '/';
  2074. }
  2075. }
  2076. else /* delim is '\'' or '"' */
  2077. {
  2078. if (c == delim)
  2079. break;
  2080. /* treat CRLF, CR and LF as single line break */
  2081. if (c == '\r')
  2082. {
  2083. if ((c = ReadChar(lexer->in)) != '\n')
  2084. UngetChar(c, lexer->in);
  2085. c = '\n';
  2086. }
  2087. if (c == '\n' || c == '<' || c == '>')
  2088. ++quotewarning;
  2089. if (c == '>')
  2090. seen_gt = yes;
  2091. }
  2092. if (c == '&')
  2093. {
  2094. AddCharToLexer(lexer, c);
  2095. ParseEntity(lexer, null);
  2096. continue;
  2097. }
  2098. /*
  2099. kludge for JavaScript attribute values
  2100. with line continuations in string literals
  2101. */
  2102. if (c == '\\')
  2103. {
  2104. c = ReadChar(lexer->in);
  2105. if (c != '\n')
  2106. {
  2107. UngetChar(c, lexer->in);
  2108. c = '\\';
  2109. }
  2110. }
  2111. map = MAP(c);
  2112. if (map & white)
  2113. {
  2114. if (delim == (char)0)
  2115. break;
  2116. if (munge)
  2117. {
  2118. c = ' ';
  2119. if (lastc == ' ')
  2120. continue;
  2121. }
  2122. }
  2123. else if (foldCase && (map & uppercase) != 0)
  2124. c += (uint)('a' - 'A');
  2125. AddCharToLexer(lexer, c);
  2126. }
  2127. if (quotewarning > 10 && seen_gt && munge)
  2128. {
  2129. /*
  2130. there is almost certainly a missing trailling quote mark
  2131. as we have see too many newlines, < or > characters.
  2132. an exception is made for Javascript attributes and the
  2133. javascript URL scheme which may legitimately include < and >
  2134. */
  2135. if (!IsScript(name) &&
  2136. !(IsUrl(name) && wstrncmp(lexer->lexbuf+start, "javascript:", 11) == 0))
  2137. ReportError(lexer, null, null, SUSPECTED_MISSING_QUOTE);
  2138. }
  2139. len = lexer->lexsize - start;
  2140. lexer->lexsize = start;
  2141. if (len > 0 || delim)
  2142. value = wstrndup(lexer->lexbuf+start, len);
  2143. else
  2144. value = null;
  2145. /* note delimiter if given */
  2146. *pdelim = (delim ? delim : '"');
  2147. return value;
  2148. }
  2149. /* attr must be non-null */
  2150. Bool IsValidAttrName( char *attr)
  2151. {
  2152. uint map, c;
  2153. int i;
  2154. /* first character should be a letter */
  2155. c = attr[0];
  2156. map = MAP(c);
  2157. if (!(map & letter))
  2158. return no;
  2159. /* remaining characters should be namechars */
  2160. for( i = 1; i < wstrlen(attr); i++)
  2161. {
  2162. c = attr[i];
  2163. map = MAP(c);
  2164. if (map & namechar)
  2165. continue;
  2166. return no;
  2167. }
  2168. return yes;
  2169. }
  2170. /* create a new attribute */
  2171. AttVal *NewAttribute()
  2172. {
  2173. AttVal *av;
  2174. av = (AttVal *)MemAlloc(sizeof(AttVal));
  2175. av->next = null;
  2176. av->delim = '\0';
  2177. av->asp = null;
  2178. av->php = null;
  2179. av->attribute = null;
  2180. av->value = null;
  2181. av->dict = null;
  2182. return av;
  2183. }
  2184. /* swallows closing '>' */
  2185. AttVal *ParseAttrs(Lexer *lexer, Bool *isempty)
  2186. {
  2187. AttVal *av, *list;
  2188. char *attribute, *value;
  2189. int delim;
  2190. Node *asp, *php;
  2191. list = null;
  2192. for (; !EndOfInput(lexer);)
  2193. {
  2194. attribute = ParseAttribute(lexer, isempty, &asp, &php);
  2195. if (attribute == null)
  2196. {
  2197. /* check if attributes are created by ASP markup */
  2198. if (asp)
  2199. {
  2200. av = NewAttribute();
  2201. av->next = list;
  2202. av->asp = asp;
  2203. list = av;
  2204. continue;
  2205. }
  2206. /* check if attributes are created by PHP markup */
  2207. if (php)
  2208. {
  2209. av = NewAttribute();
  2210. av->next = list;
  2211. av->php = php;
  2212. list = av;
  2213. continue;
  2214. }
  2215. break;
  2216. }
  2217. value = ParseValue(lexer, attribute, no, isempty, &delim);
  2218. if (attribute && IsValidAttrName(attribute))
  2219. {
  2220. av = NewAttribute();
  2221. av->next = list;
  2222. av->delim = delim;
  2223. av->attribute = attribute;
  2224. av->value = value;
  2225. av->dict = FindAttribute(av);
  2226. list = av;
  2227. }
  2228. else
  2229. {
  2230. av = NewAttribute();
  2231. av->attribute = attribute;
  2232. av->value = value;
  2233. ReportAttrError(lexer, lexer->token, value, BAD_ATTRIBUTE_VALUE);
  2234. FreeAttribute(av);
  2235. }
  2236. }
  2237. return list;
  2238. }