PageRenderTime 51ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/amaya/html2thot.c

https://github.com/pffy/Amaya-Editor
C | 7618 lines | 6067 code | 333 blank | 1218 comment | 1612 complexity | 29dab6b5f25c8433321a1eb755e2bd6a MD5 | raw file
  1. /*
  2. *
  3. * (c) COPYRIGHT INRIA and W3C, 1996-2009
  4. * Please first read the full copyright statement in file COPYRIGHT.
  5. *
  6. */
  7. /*
  8. *
  9. * html2thot parses a HTML file and builds the corresponding abstract tree
  10. * for a Thot document of type HTML.
  11. *
  12. * Author: V. Quint
  13. * I. Vatton (W3C/INRIA): XML extension and Unicode
  14. */
  15. #define THOT_EXPORT extern
  16. #include "amaya.h"
  17. #include "css.h"
  18. #include "fetchHTMLname.h"
  19. #include "parser.h"
  20. #include "zlib.h"
  21. #include "AHTURLTools_f.h"
  22. #include "css_f.h"
  23. #include "EDITstyle_f.h"
  24. #include "fetchHTMLname_f.h"
  25. #include "fetchXMLname_f.h"
  26. #include "html2thot_f.h"
  27. #include "HTMLactions_f.h"
  28. #include "HTMLedit_f.h"
  29. #include "HTMLimage_f.h"
  30. #include "HTMLtable_f.h"
  31. #include "HTMLsave_f.h"
  32. #include "init_f.h"
  33. #include "styleparser_f.h"
  34. #include "UIcss_f.h"
  35. #include "XHTMLbuilder_f.h"
  36. #include "Xml2thot_f.h"
  37. #ifdef ANNOTATIONS
  38. #include "annotlib.h"
  39. #include "ANNOTtools_f.h"
  40. #endif /* ANNOTATIONS */
  41. /* tables defined in XHTMLbuilder.c */
  42. extern AttrValueMapping XhtmlAttrValueMappingTable[];
  43. extern XmlEntity XhtmlEntityTable[];
  44. typedef struct _UnicodeFallbackEntry
  45. {
  46. int unicodeVal; /* the Unicode code */
  47. int EightbitCode; /* the corresponding glyph to be used from
  48. the ISO Latin-1 or Symbol character set.
  49. if 0 < EightbitCode < 255, it's the Symbol code for the correct glyph
  50. if 1000 < EightbitCode < 1256, it's the ISO Latin-1 code + 1000 of an
  51. approaching glyph
  52. if 2000 < EightbitCode < 2256, it's the Symbol code + 2000 of an
  53. approaching glyph */
  54. }
  55. UnicodeFallbackEntry;
  56. UnicodeFallbackEntry UnicodeFallbackTable[] =
  57. {
  58. /* This table MUST be ordered according to the first field of each
  59. entry (Unicode code) */
  60. /* OElig */ {338, 1079}, /* latin capital ligature OE, U+0152 ISOlat2 */
  61. /* oelig */ {339, 1111}, /* latin small ligature oe, U+0153 ISOlat2 */
  62. /* Scaron */ {352, 1083}, /* latin capital letter S with caron, U+0160 ISOlat2 */
  63. /* scaron */ {353, 1115}, /* latin small letter s with caron, U+0161 ISOlat2 */
  64. /* Yuml */ {376, 1089}, /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */
  65. /* fnof */ {402, 166}, /* latin small f with hook = function = florin, U+0192 ISOtech */
  66. /* epsilon */ {603, 101}, /* greek small letter epsilon, U+03B5 ISOgrk3 */
  67. /* circ */ {710, 2217}, /* modifier letter circumflex accent, U+02C6 ISOpub */
  68. /* tilde */ {732, 1126}, /* small tilde, U+02DC ISOdia */
  69. /* hat */ {770, 1094}, /* small tilde, U+02DC ISOdia */
  70. /* UnderBar */ {818, 45}, /* U+0332 */
  71. /* Alpha */ {913, 65}, /* greek capital letter alpha, U+0391 */
  72. /* Beta */ {914, 66}, /* greek capital letter beta, U+0392 */
  73. /* Gamma */ {915, 71}, /* greek capital letter gamma, U+0393 ISOgrk3 */
  74. /* Delta */ {916, 68}, /* greek capital letter delta, U+0394 ISOgrk3 */
  75. /* Epsilon */ {917, 69}, /* greek capital letter epsilon, U+0395 */
  76. /* Zeta */ {918, 90}, /* greek capital letter zeta, U+0396 */
  77. /* Eta */ {919, 72}, /* greek capital letter eta, U+0397 */
  78. /* Theta */ {920, 81}, /* greek capital letter theta, U+0398 ISOgrk3 */
  79. /* Iota */ {921, 73}, /* greek capital letter iota, U+0399 */
  80. /* Kappa */ {922, 75}, /* greek capital letter kappa, U+039A */
  81. /* Lambda */ {923, 76}, /* greek capital letter lambda, U+039B ISOgrk3 */
  82. /* Mu */ {924, 77}, /* greek capital letter mu, U+039C */
  83. /* Nu */ {925, 78}, /* greek capital letter nu, U+039D */
  84. /* Xi */ {926, 88}, /* greek capital letter xi, U+039E ISOgrk3 */
  85. /* Omicron */ {927, 79}, /* greek capital letter omicron, U+039F */
  86. /* Pi */ {928, 80}, /* greek capital letter pi, U+03A0 ISOgrk3 */
  87. /* Rho */ {929, 82}, /* greek capital letter rho, U+03A1 */
  88. /* Sigma */ {931, 83}, /* greek capital letter sigma, U+03A3 ISOgrk3 */
  89. /* Tau */ {932, 84}, /* greek capital letter tau, U+03A4 */
  90. /* Upsilon */ {933, 85}, /* greek capital letter upsilon, U+03A5 ISOgrk3 */
  91. /* Phi */ {934, 70}, /* greek capital letter phi, U+03A6 ISOgrk3 */
  92. /* Chi */ {935, 67}, /* greek capital letter chi, U+03A7 */
  93. /* Psi */ {936, 89}, /* greek capital letter psi, U+03A8 ISOgrk3 */
  94. /* Omega */ {937, 87}, /* greek capital letter omega, U+03A9 ISOgrk3 */
  95. /* alpha */ {945, 97}, /* greek small letter alpha, U+03B1 ISOgrk3 */
  96. /* beta */ {946, 98}, /* greek small letter beta, U+03B2 ISOgrk3 */
  97. /* gamma */ {947, 103}, /* greek small letter gamma, U+03B3 ISOgrk3 */
  98. /* delta */ {948, 100}, /* greek small letter delta, U+03B4 ISOgrk3 */
  99. /* epsilon */ {949, 101}, /* greek small letter epsilon, U+03B5 ISOgrk3 */
  100. /* zeta */ {950, 122}, /* greek small letter zeta, U+03B6 ISOgrk3 */
  101. /* eta */ {951, 104}, /* greek small letter eta, U+03B7 ISOgrk3 */
  102. /* theta */ {952, 113}, /* greek small letter theta, U+03B8 ISOgrk3 */
  103. /* iota */ {953, 105}, /* greek small letter iota, U+03B9 ISOgrk3 */
  104. /* kappa */ {954, 107}, /* greek small letter kappa, U+03BA ISOgrk3 */
  105. /* lambda */ {955, 108}, /* greek small letter lambda, U+03BB ISOgrk3 */
  106. /* mu */ {956, 109}, /* greek small letter mu, U+03BC ISOgrk3 */
  107. /* nu */ {957, 110}, /* greek small letter nu, U+03BD ISOgrk3 */
  108. /* xi */ {958, 120}, /* greek small letter xi, U+03BE ISOgrk3 */
  109. /* omicron */ {959, 111}, /* greek small letter omicron, U+03BF NEW */
  110. /* pi */ {960, 112}, /* greek small letter pi, U+03C0 ISOgrk3 */
  111. /* rho */ {961, 114}, /* greek small letter rho, U+03C1 ISOgrk3 */
  112. /* sigmaf */ {962, 86}, /* greek small letter final sigma, U+03C2 ISOgrk3 */
  113. /* sigma */ {963, 115}, /* greek small letter sigma, U+03C3 ISOgrk3 */
  114. /* tau */ {964, 116}, /* greek small letter tau, U+03C4 ISOgrk3 */
  115. /* upsilon */ {965, 117}, /* greek small letter upsilon, U+03C5 ISOgrk3 */
  116. /* phi */ {966, 106}, /* greek small letter phi, U+03C6 ISOgrk3 */
  117. /* chi */ {967, 99}, /* greek small letter chi, U+03C7 ISOgrk3 */
  118. /* psi */ {968, 121}, /* greek small letter psi, U+03C8 ISOgrk3 */
  119. /* omega */ {969, 119}, /* greek small letter omega, U+03C9 ISOgrk3 */
  120. /* thetasym */ {977, 74}, /* greek small letter theta symbol, U+03D1 NEW */
  121. /* upsih */ {978, 161}, /* greek upsilon with hook symbol, U+03D2 NEW */
  122. /* phiv */ {981, 102}, /* greek U+03D5 ISOgrk3 */
  123. /* piv */ {982, 118}, /* greek pi symbol, U+03D6 ISOgrk3 */
  124. /* ensp */ {8194, 1130}, /* en space, U+2002 ISOpub */
  125. /* emsp */ {8195, 1160}, /* em space, U+2003 ISOpub */
  126. /* thinsp */ {8201, 1129}, /* thin space, U+2009 ISOpub */
  127. /* zwnj */ {8204, 1063}, /* zero width non-joiner, U+200C NEW RFC 2070 */
  128. /* zwj */ {8205, 1063}, /* zero width joiner, U+200D NEW RFC 2070 */
  129. /* lrm */ {8206, 1063}, /* left-to-right mark, U+200E NEW RFC 2070 */
  130. /* rlm */ {8207, 1063}, /* right-to-left mark, U+200F NEW RFC 2070 */
  131. /* ndash */ {8211, 2045}, /* en dash, U+2013 ISOpub */
  132. /* mdash */ {8212, 2190}, /* em dash, U+2014 ISOpub */
  133. /* horbar */ {8213, 190}, /* U+2015 */
  134. /* Verbar */ {8214, 189}, /* U+2016 */
  135. /* lsquo */ {8216, 1096}, /* left single quotation mark, U+2018 ISOnum */
  136. /* rsquo */ {8217, 1039}, /* right single quotation mark, U+2019 ISOnum */
  137. /* sbquo */ {8218, 1044}, /* single low-9 quotation mark, U+201A NEW */
  138. /* ldquo */ {8220, 1096}, /* left double quotation mark, U+201C ISOnum */
  139. /* rdquo */ {8221, 1039}, /* right double quotation mark, U+201D ISOnum */
  140. /* bdquo */ {8222, 1044}, /* double low-9 quotation mark, U+201E NEW */
  141. /* dagger */ {8224, 2042}, /* dagger, U+2020 ISOpub */
  142. /* Dagger */ {8225, 2042}, /* double dagger, U+2021 ISOpub */
  143. /* bull */ {8226, 183}, /* bullet = black small circle, U+2022 ISOpub */
  144. /* hellip */ {8230, 188}, /* horizontal ellipsis = three dot leader, U+2026 ISOpub */
  145. /* lre */ {8234, 1063}, /* left-to-right embed, U+202A NEW RFC 2070 */
  146. /* rle */ {8235, 1063}, /* right-to-left embed, U+202B NEW RFC 2070 */
  147. /* pdf */ {8236, 1063}, /* pop directional format, U+202C NEW RFC 2070 */
  148. /* lro */ {8237, 1063}, /* left-to-right override, U+202D NEW RFC 2070 */
  149. /* rlo */ {8238, 1063}, /* right-to-left override, U+202E NEW RFC 2070 */
  150. /* permil */ {8240, 2037}, /* per mille sign, U+2030 ISOtech */
  151. /* prime */ {8242, 162}, /* prime = minutes = feet, U+2032 ISOtech */
  152. /* Prime */ {8243, 178}, /* double prime = seconds = inches, U+2033 ISOtech */
  153. /* lsaquo */ {8249, 1060}, /* single left-pointing angle quotation mark, U+2039 ISO proposed */
  154. /* rsaquo */ {8250, 1062}, /* single right-pointing angle quotation mark, U+203A ISO proposed */
  155. /* oline */ {8254, 1175}, /* overline = spacing overscore, U+203E NEW */
  156. /* frasl */ {8260, 164}, /* fraction slash, U+2044 NEW */
  157. /*ApplyFunction*/ {8289, 1129}, /* thin space, U+2009 ISOpub */
  158. /*InvisibleTimes*/ {8290, 1129}, /* thin space, U+2009 ISOpub */
  159. /*InvisibleComa*/ {8291, 1129}, /* thin space, U+2009 ISOpub */
  160. /* euro */ {8364, 2206}, /* euro sign, U+20AC NEW */
  161. /*TripleDot */ {8411, 188}, /* tdot, U+20DB ISOtech */
  162. /* image */ {8465, 193}, /* blackletter capital I = imaginary part, U+2111 ISOamso */
  163. /* copysf */ {8471, 211}, /* U+2117 */
  164. /* weierp */ {8472, 195}, /* script capital P = power set = Weierstrass p, U+2118 ISOamso */
  165. /* real */ {8476, 194}, /* blackletter capital R = real part symbol, U+211C ISOamso */
  166. /* trade */ {8482, 212}, /* trade mark sign, U+2122 ISOnum */
  167. /* alefsym */ {8501, 192}, /* alef symbol = first transfinite cardinal, U+2135 NEW */
  168. /*DifferentialD*/{8518, 1100}, /* U+2146 */
  169. /*ExponentialE*/{8519, 1101},/* */
  170. /*ImaginaryI*/ {8520, 1105},/* */
  171. /* larr */ {8592, 172}, /* leftwards arrow, U+2190 ISOnum */
  172. /* uarr */ {8593, 173}, /* upwards arrow, U+2191 ISOnum*/
  173. /* rarr */ {8594, 174}, /* rightwards arrow, U+2192 ISOnum */
  174. /* darr */ {8595, 175}, /* downwards arrow, U+2193 ISOnum */
  175. /* harr */ {8596, 171}, /* left right arrow, U+2194 ISOamsa */
  176. /* crarr */ {8629, 191}, /* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */
  177. /* lrarr */ {8646, 171}, /* U+21C6 */
  178. /* lArr */ {8656, 220}, /* leftwards double arrow, U+21D0 ISOtech */
  179. /* uArr */ {8657, 221}, /* upwards double arrow, U+21D1 ISOamsa */
  180. /* rArr */ {8658, 222}, /* rightwards double arrow, U+21D2 ISOtech */
  181. /* dArr */ {8659, 223}, /* downwards double arrow, U+21D3 ISOamsa */
  182. /* hArr */ {8660, 219}, /* left right double arrow, U+21D4 ISOamsa */
  183. /* forall */ {8704, 34}, /* for all, U+2200 ISOtech */
  184. /* part */ {8706, 182}, /* partial differential, U+2202 ISOtech */
  185. /* exist */ {8707, 36}, /* there exists, U+2203 ISOtech */
  186. /* empty */ {8709, 198}, /* empty set = null set = diameter, U+2205 ISOamso */
  187. /* nabla */ {8711, 209}, /* nabla = backward difference, U+2207 ISOtech */
  188. /* isin */ {8712, 206}, /* element of, U+2208 ISOtech */
  189. /* notin */ {8713, 207}, /* not an element of, U+2209 ISOtech */
  190. /* ni */ {8715, 39}, /* contains as member, U+220B ISOtech */
  191. /* prod */ {8719, 213}, /* n-ary product = product sign, U+220F ISOamsb */
  192. /* sum */ {8721, 229}, /* n-ary sumation, U+2211 ISOamsb */
  193. /* minus */ {8722, 45}, /* minus sign, U+2212 ISOtech */
  194. /* Backslash*/ {8726, 1092},/* U+8726 */
  195. /* lowast */ {8727, 42}, /* asterisk operator, U+2217 ISOtech */
  196. /* radic */ {8730, 214}, /* square root = radical sign, U+221A ISOtech */
  197. /* prop */ {8733, 181}, /* proportional to, U+221D ISOtech */
  198. /* infin */ {8734, 165}, /* infinity, U+221E ISOtech */
  199. /* ang */ {8736, 208}, /* angle, U+2220 ISOamso */
  200. /* VerticalBar*/ {8739, 1124}, /* */
  201. /* parallel */ {8741, 1124}, /* parallel, U+2225 ISOtech */
  202. /* and */ {8743, 217}, /* logical and = wedge, U+2227 ISOtech */
  203. /* or */ {8744, 218}, /* logical or = vee, U+2228 ISOtech */
  204. /* cap */ {8745, 199}, /* intersection = cap, U+2229 ISOtech */
  205. /* cup */ {8746, 200}, /* union = cup, U+222A ISOtech */
  206. /* int */ {8747, 242}, /* integral, U+222B ISOtech */
  207. /* there4 */ {8756, 92}, /* therefore, U+2234 ISOtech */
  208. /* Colon */ {8759, 58}, /* Colon, U+2237 */
  209. /* sim */ {8764, 126}, /* tilde operator = varies with = similar to, U+223C ISOtech */
  210. /*EqualTilde*/ {8770, 64}, /* U+2242 ISOamsr */
  211. /* cong */ {8773, 64}, /* approximately equal to, U+2245 ISOtech */
  212. /* asymp */ {8776, 187}, /* almost equal to = asymptotic to, U+2248 ISOamsr */
  213. /* ne */ {8800, 185}, /* not equal to, U+2260 ISOtech */
  214. /* equiv */ {8801, 186}, /* identical to, U+2261 ISOtech */
  215. /* le */ {8804, 163}, /* less-than or equal to, U+2264 ISOtech */
  216. /* ge */ {8805, 179}, /* greater-than or equal to, U+2265 ISOtech */
  217. /* sub */ {8834, 204}, /* subset of, U+2282 ISOtech */
  218. /* sup */ {8835, 201}, /* superset of, U+2283 ISOtech */
  219. /* nsub */ {8836, 203}, /* not a subset of, U+2284 ISOamsn */
  220. /* sube */ {8838, 205}, /* subset of or equal to, U+2286 ISOtech */
  221. /* supe */ {8839, 202}, /* superset of or equal to, U+2287 ISOtech */
  222. /* subne */ {8842, 203}, /* U+228A */
  223. /* oplus */ {8853, 197}, /* circled plus = direct sum, U+2295 ISOamsb */
  224. /* otimes */ {8855, 196}, /* circled times = vector product, U+2297 ISOamsb */
  225. /* DownTee */ {8868, 94}, /* U+22A4 ISOtech */
  226. /* perp */ {8869, 94}, /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */
  227. /* Vee */ {8897, 218}, /* U+22C1 ISOamsb */
  228. /*Intersection*/ {8898, 199}, /* U+22C2 ISOamsb */
  229. /*Intersection*/ {8899, 200}, /* U+22C3 ISOamsb */
  230. /* Diamond */ {8900, 168}, /* diamond operator, U+22C4 ISOamsb */
  231. /* sdot */ {8901, 215}, /* dot operator, U+22C5 ISOamsb */
  232. /* star */ {8902, 1042},/* */
  233. /* Subset */ {8912, 204}, /* U+22D0 */
  234. /* Cap */ {8914, 199}, /* U+22D2 */
  235. /* Cup */ {8915, 199}, /* U+22D3 */
  236. /* lceil */ {8968, 233}, /* left ceiling = apl upstile, U+2308 ISOamsc */
  237. /* rceil */ {8969, 249}, /* right ceiling, U+2309 ISOamsc */
  238. /* lfloor */ {8970, 235}, /* left floor = apl downstile, U+230A ISOamsc */
  239. /* rfloor */ {8971, 251}, /* right floor, U+230B ISOamsc */
  240. /* lang */ {9001, 225}, /* left-pointing angle bracket = bra, U+2329 ISOtech */
  241. /* rang */ {9002, 241}, /* right-pointing angle bracket = ket, U+232A ISOtech */
  242. /* dtri */ {9663, 209}, /* lozenge, U+25BF */
  243. /* loz */ {9674, 224}, /* lozenge, U+25CA ISOpub */
  244. /* spades */ {9824, 170}, /* black spade suit, U+2660 ISOpub */
  245. /* clubs */ {9827, 167}, /* black club suit = shamrock, U+2663 ISOpub */
  246. /* hearts */ {9829, 169}, /* black heart suit = valentine, U+2665 ISOpub */
  247. /* diams */ {9830, 168}, /* black diamond suit, U+2666 ISOpub */
  248. /* lang */ {0x27E8, 225},/* U+27E8 Mathematical left angle bracket */
  249. /* rang */ {0x27E9, 241},/* U+27E9 Mathematical right angle bracket */
  250. /* And */ {10835, 217}, /* U+2A53 */
  251. /* Or */ {10836, 218}, /* U+2A54 */
  252. /* Equal */ {10869, 1061},/* */
  253. /* Not */ {10988, 216}, /* U+2AEC */
  254. /* OverBrace*/ {65079, 132}, /* U+FE37 */
  255. /*UnderBrace*/ {65080, 133}, /* U+FE38 */
  256. /* THE END */ {0, 0} /* last entry (required) */
  257. };
  258. typedef struct _ElemToBeChecked *PtrElemToBeChecked;
  259. typedef struct _ElemToBeChecked
  260. {
  261. Element Elem; /* the element to be checked */
  262. PtrElemToBeChecked nextElemToBeChecked;
  263. }
  264. ElemToBeChecked;
  265. /* empty elements */
  266. static int EmptyElement[] =
  267. {
  268. HTML_EL_AREA,
  269. HTML_EL_BASE,
  270. HTML_EL_BaseFont,
  271. HTML_EL_BR,
  272. HTML_EL_COL,
  273. HTML_EL_FRAME,
  274. HTML_EL_Horizontal_Rule,
  275. HTML_EL_IMG,
  276. HTML_EL_Input,
  277. HTML_EL_ISINDEX,
  278. HTML_EL_LINK,
  279. HTML_EL_META,
  280. HTML_EL_Parameter,
  281. HTML_EL_PICTURE_UNIT,
  282. 0};
  283. /* character level elements */
  284. static int CharLevelElement[] =
  285. {
  286. HTML_EL_TEXT_UNIT, HTML_EL_PICTURE_UNIT, HTML_EL_SYMBOL_UNIT,
  287. HTML_EL_Anchor,
  288. HTML_EL_Teletype_text, HTML_EL_Italic_text, HTML_EL_Bold_text,
  289. HTML_EL_Underlined_text, HTML_EL_Struck_text, HTML_EL_Big_text,
  290. HTML_EL_Small_text,
  291. HTML_EL_Emphasis, HTML_EL_Strong, HTML_EL_Def, HTML_EL_Code, HTML_EL_Sample,
  292. HTML_EL_Keyboard, HTML_EL_Variable_, HTML_EL_Cite, HTML_EL_ABBR,
  293. HTML_EL_ACRONYM,
  294. HTML_EL_Font_, HTML_EL_Quotation, HTML_EL_Subscript, HTML_EL_Superscript,
  295. HTML_EL_Span, HTML_EL_BDO, HTML_EL_ins, HTML_EL_del,
  296. HTML_EL_IMG, HTML_EL_Input,
  297. HTML_EL_Option, HTML_EL_OptGroup, HTML_EL_Option_Menu,
  298. HTML_EL_Text_Input, HTML_EL_Password_Input, HTML_EL_File_Input,
  299. HTML_EL_Checkbox_Input, HTML_EL_Radio_Input, HTML_EL_Submit_Input,
  300. HTML_EL_Reset_Input, HTML_EL_Hidden_Input, HTML_EL_Inserted_Text,
  301. HTML_EL_Button_Input, HTML_EL_BUTTON_,
  302. HTML_EL_LABEL,
  303. HTML_EL_BR, HTML_EL_ruby,
  304. HTML_EL_Object, HTML_EL_Basic_Elem,
  305. 0};
  306. /* block level elements, i.e. elements having a Line rule in the presentation
  307. schema fo the main view */
  308. static int BlockLevelElement[] =
  309. {
  310. HTML_EL_H1, HTML_EL_H2, HTML_EL_H3, HTML_EL_H4, HTML_EL_H5, HTML_EL_H6,
  311. HTML_EL_Paragraph, HTML_EL_Pseudo_paragraph, HTML_EL_Text_Area,
  312. HTML_EL_Term, HTML_EL_Address, HTML_EL_LEGEND, HTML_EL_CAPTION,
  313. HTML_EL_INS, HTML_EL_DEL, HTML_EL_Division,
  314. 0};
  315. /* start tags that imply the end of a current element */
  316. /* any tag of each line implies the end of the current element if the type of
  317. that element is in the same line */
  318. typedef char oneLine[100];
  319. static oneLine EquivEndingElem[] =
  320. {
  321. "dt dd li option",
  322. "h1 h2 h3 h4 h5 h6",
  323. "address pre listing xmp",
  324. ""
  325. };
  326. /* acording the HTML DTD, HR should be added to the 2nd line above, as it */
  327. /* is not allowed within a H1, H2, H3, etc. But we should tolerate that case */
  328. /* because many documents contain rules in headings... */
  329. /* start tags that imply the end of current element */
  330. static oneLine StartTagEndingElem[] =
  331. {
  332. "form closes form p p* hr h1 h2 h3 h4 h5 h6 dl ul ol menu dir address pre listing xmp head",
  333. "head closes p p*",
  334. "title closes p p*",
  335. "body closes head style script title p p*",
  336. "li closes p p* h1 h2 h3 h4 h5 h6 dl address pre listing xmp head",
  337. "hr closes p p* head",
  338. "h1 closes p p* head",
  339. "h2 closes p p* head",
  340. "h3 closes p p* head",
  341. "h4 closes p p* head",
  342. "h5 closes p p* head",
  343. "h6 closes p p* head",
  344. "dir closes p p* head",
  345. "address closes p p* head ul",
  346. "pre closes p p* head ul",
  347. "listing closes p p* head",
  348. "xmp closes p p* head",
  349. "blockquote closes p p* head",
  350. "dl closes p p* dt menu dir address pre listing xmp head",
  351. "dt closes p p* menu dir address pre listing xmp head",
  352. "dd closes p p* menu dir address pre listing xmp head",
  353. "ul closes p p* head ol menu dir address pre listing xmp",
  354. "ol closes p p* head ul",
  355. "menu closes p p* head ul",
  356. "p closes p p* head h1 h2 h3 h4 h5 h6",
  357. "p* closes p p* head",
  358. "div closes p p* head",
  359. "noscript closes p p* head",
  360. "center closes font b i p p* head",
  361. "a closes a",
  362. "caption closes p p*",
  363. "colgroup closes caption colgroup col p p*",
  364. "col closes caption col p p*",
  365. "table closes p p* head h1 h2 h3 h4 h5 h6 pre listing xmp a",
  366. "th closes th td",
  367. "td closes th td",
  368. "tr closes th td tr caption col colgroup",
  369. "thead closes caption col colgroup",
  370. "tfoot closes th td tr caption col colgroup thead tbody",
  371. "tbody closes th td tr caption col colgroup thead tfoot tbody",
  372. "optgroup closes option",
  373. "fieldset closes legend p p* head h1 h2 h3 h4 h5 h6 pre listing xmp a",
  374. ""
  375. };
  376. typedef int State; /* a state of the automaton */
  377. extern int HTML_ENTRIES;
  378. static PtrClosedElement *FirstClosedElem;
  379. /* ---------------------- static variables ---------------------- */
  380. /* parser stack */
  381. #define MaxStack 200 /* maximum stack height */
  382. static int GINumberStack[MaxStack]; /* entry of pHTMLGIMapping */
  383. static Element ElementStack[MaxStack]; /* element in the Thot abstract
  384. tree */
  385. static int ThotLevel[MaxStack]; /* level of element in the Thot
  386. tree */
  387. static Language LanguageStack[MaxStack]; /* element language */
  388. static int StackLevel = 0; /* first free element on the
  389. stack */
  390. /* information about the input file */
  391. #define INPUT_FILE_BUFFER_SIZE 2000
  392. #define PREV_READ_CHARS 30
  393. static char FileBuffer[INPUT_FILE_BUFFER_SIZE+1];
  394. static char PreviousRead[PREV_READ_CHARS+1];
  395. static char *WorkBuffer = FileBuffer;
  396. static int LastCharInWorkBuffer = 0; /* last char. in the buffer */
  397. static int LastCharInPreviousRead = 0;
  398. static int CurrentBufChar = 0; /* current character read */
  399. static int StartOfTagIndx = 0; /* last "<" read */
  400. static int StartOfRead = 0;
  401. static char PreviousBufChar = EOS; /* previous character read */
  402. static char *InputText = NULL;
  403. static gzFile stream = 0;
  404. static int NumberOfLinesRead = 0;/* number of lines read in the
  405. file */
  406. static int NumberOfCharRead = 0; /* number of characters read in the
  407. current line */
  408. static ThotBool EmptyLine = TRUE; /* no printable character encountered
  409. yet in the current line */
  410. static ThotBool StartOfFile = TRUE; /* no printable character encountered
  411. yet in the file */
  412. static ThotBool AfterTagPRE = FALSE; /* <PRE> has just been read */
  413. static char* docURL = NULL; /* path or URL of the document */
  414. static char *docURL2 = NULL; /* save the docURL for some cases of parsing errors */
  415. /* Static variables used for the call to the XML parser */
  416. static ThotBool NotToReadFile = FALSE;
  417. static int PreviousNumberOfLinesRead = 0;
  418. static int PreviousNumberOfCharRead = 0;
  419. /* Boolean that indicates the end of a HTML file */
  420. /* It is a static variable because it is used in parameter */
  421. /* for the call of the new XML parser (EndOfStartGI) */
  422. static ThotBool EndOfHtmlFile;
  423. /* input buffer */
  424. #define MaxBufferLength 1000
  425. #define AllmostFullBuffer 700
  426. #define MaxMsgLength 300 /* maximum size of error messages */
  427. static unsigned char inputBuffer[MaxBufferLength];
  428. static int LgBuffer = 0; /* actual length of text in input
  429. buffer */
  430. static int BufferLineNumber = 0; /* line number in the source file of
  431. the beginning of the text
  432. contained in the buffer */
  433. /* information about the Thot document under construction */
  434. /* global data used by the HTML parser */
  435. static ParserData HTMLcontext = {0, ISO_8859_1, 0, NULL, 0,
  436. FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
  437. static SSchema DocumentSSchema = NULL; /* the HTML structure schema */
  438. static Element rootElement = NULL; /* root element of the document */
  439. static int lastElemEntry = 0; /* index in the pHTMLGIMapping of the
  440. element being created */
  441. static Attribute lastAttribute = NULL; /* last attribute created */
  442. static Attribute lastAttrElement = NULL;/* element with which the last
  443. attribute has been associated */
  444. static AttributeMapping *lastAttrEntry = NULL; /* entry in the AttributeMappingTable
  445. of the attribute being created */
  446. static ThotBool UnknownAttr = FALSE; /* the last attribute encountered is
  447. invalid */
  448. static ThotBool ReadingAnAttrValue = FALSE;
  449. static ThotBool TruncatedAttrValue = FALSE;
  450. static char *BufferAttrValue = NULL;
  451. static int LgBufferAttrValue = 0;
  452. static Element CommentText = NULL; /* TEXT element of the current
  453. Comment element */
  454. static Element ASPText = NULL; /* TEXT element of the current
  455. ASP element */
  456. static Element PIText = NULL; /* TEXT element of the current
  457. ASP element */
  458. static ThotBool UnknownTag = FALSE; /* the last start tag encountered is
  459. invalid */
  460. static ThotBool HTMLrootClosed = FALSE;
  461. static char *HTMLrootClosingTag = NULL;
  462. static PtrElemToBeChecked FirstElemToBeChecked = NULL;
  463. static PtrElemToBeChecked LastElemToBeChecked = NULL;
  464. /* automaton */
  465. static State currentState; /* current state of the automaton */
  466. static State returnState; /* return state from subautomaton */
  467. static ThotBool NormalTransition;
  468. static ThotBool CharProcessed;
  469. /* information about an entity being read */
  470. static char EntityName[MaxEntityLength];/* name of entity being read */
  471. static int LgEntityName = 0; /* length of entity name read so
  472. far */
  473. static int EntityTableEntry = 0; /* entry of the entity table that
  474. matches the entity read so far */
  475. static int CharRank = 0; /* rank of the last matching
  476. character in that entry */
  477. /* second char of an UTF-8 string */
  478. static unsigned char SecondByte[6] = {EOS, EOS, EOS, EOS, EOS, EOS};
  479. static void ProcessStartGI (const char* GIname);
  480. static void EndOfAttrValue (char c);
  481. /*----------------------------------------------------------------------
  482. ----------------------------------------------------------------------*/
  483. static const char *StrCaseStr (const char *str1, const char *str2)
  484. {
  485. char c;
  486. const char *ptr;
  487. int len;
  488. if (str1 == NULL || str2 == NULL)
  489. return NULL;
  490. c = *str2;
  491. len = strlen ((char *)str2);
  492. ptr = str1;
  493. while (*ptr != EOS)
  494. {
  495. if (tolower(*ptr) == c && !strncasecmp ((char *)str2, ptr, len))
  496. return ptr;
  497. else
  498. ptr++;
  499. }
  500. return NULL;
  501. }
  502. /*----------------------------------------------------------------------
  503. ----------------------------------------------------------------------*/
  504. char *SkipSep (char *ptr)
  505. {
  506. while (*ptr == SPACE || *ptr == ',')
  507. ptr++;
  508. return (ptr);
  509. }
  510. /*----------------------------------------------------------------------
  511. ----------------------------------------------------------------------*/
  512. char *SkipInt (char *ptr)
  513. {
  514. while (*ptr != EOS && *ptr != SPACE && *ptr != ',')
  515. ptr++;
  516. return (ptr);
  517. }
  518. /*----------------------------------------------------------------------
  519. ParseAreaCoords
  520. Computes x, y, width and height of the box from the coords attribute value.
  521. ----------------------------------------------------------------------*/
  522. void ParseAreaCoords (Element element, Document document)
  523. {
  524. ElementType elType;
  525. AttributeType attrType;
  526. Attribute attrCoords, attrX, attrY;
  527. Attribute attrW, attrH, attrShape;
  528. char *ptr3, *text;
  529. int x1, y1, x2, y2;
  530. int length, shape, r;
  531. /* Is it an AREA element */
  532. elType = TtaGetElementType (element);
  533. if (elType.ElTypeNum != HTML_EL_AREA)
  534. return;
  535. /* Search the coords attribute */
  536. attrType.AttrSSchema = elType.ElSSchema;
  537. attrType.AttrTypeNum = HTML_ATTR_coords;
  538. attrCoords = TtaGetAttribute (element, attrType);
  539. if (attrCoords == NULL)
  540. return;
  541. /* Search the shape attribute */
  542. attrType.AttrTypeNum = HTML_ATTR_shape;
  543. attrShape = TtaGetAttribute (element, attrType);
  544. if (attrShape == NULL)
  545. /* no shape attribute. Create one with value rectangle */
  546. {
  547. attrShape = TtaNewAttribute (attrType);
  548. TtaAttachAttribute (element, attrShape, document);
  549. shape = HTML_ATTR_shape_VAL_rectangle;
  550. TtaSetAttributeValue (attrShape, shape, element, document);
  551. }
  552. else
  553. shape = TtaGetAttributeValue (attrShape);
  554. length = TtaGetTextAttributeLength (attrCoords);
  555. text = (char*)TtaGetMemory (length + 1);
  556. TtaGiveTextAttributeValue (attrCoords, text, &length);
  557. if (shape == HTML_ATTR_shape_VAL_rectangle ||
  558. shape == HTML_ATTR_shape_VAL_circle)
  559. {
  560. /* Search the x_coord attribute */
  561. attrType.AttrTypeNum = HTML_ATTR_x_coord;
  562. attrX = TtaGetAttribute (element, attrType);
  563. if (attrX == NULL)
  564. {
  565. /* create it */
  566. attrX = TtaNewAttribute (attrType);
  567. TtaAttachAttribute (element, attrX, document);
  568. }
  569. /* Search the y_coord attribute */
  570. attrType.AttrTypeNum = HTML_ATTR_y_coord;
  571. attrY = TtaGetAttribute (element, attrType);
  572. if (attrY == NULL)
  573. {
  574. /* create it */
  575. attrY = TtaNewAttribute (attrType);
  576. TtaAttachAttribute (element, attrY, document);
  577. }
  578. /* Search the width attribute */
  579. attrType.AttrTypeNum = HTML_ATTR_IntWidthPxl;
  580. attrW = TtaGetAttribute (element, attrType);
  581. if (attrW == NULL)
  582. {
  583. /* create it */
  584. attrW = TtaNewAttribute (attrType);
  585. TtaAttachAttribute (element, attrW, document);
  586. }
  587. /* Search the height attribute */
  588. attrType.AttrTypeNum = HTML_ATTR_IntHeightPxl;
  589. attrH = TtaGetAttribute (element, attrType);
  590. if (attrH == NULL)
  591. {
  592. /* create it */
  593. attrH = TtaNewAttribute (attrType);
  594. TtaAttachAttribute (element, attrH, document);
  595. }
  596. if (shape == HTML_ATTR_shape_VAL_rectangle)
  597. {
  598. x1 = x2 = y1 = y2 = 0;
  599. ptr3 = text;
  600. if (ptr3)
  601. sscanf (ptr3, "%d", &x1);
  602. ptr3 = SkipInt (ptr3);
  603. ptr3 = SkipSep (ptr3);
  604. if (ptr3)
  605. sscanf (ptr3, "%d", &y1);
  606. ptr3 = SkipInt (ptr3);
  607. ptr3 = SkipSep (ptr3);
  608. if (ptr3)
  609. sscanf (ptr3, "%d", &x2);
  610. ptr3 = SkipInt (ptr3);
  611. ptr3 = SkipSep (ptr3);
  612. sscanf (ptr3, "%d", &y2);
  613. TtaSetAttributeValue (attrX, x1, element, document);
  614. TtaSetAttributeValue (attrY, y1, element, document);
  615. TtaSetAttributeValue (attrW, x2 - x1, element, document);
  616. TtaSetAttributeValue (attrH, y2 - y1, element, document);
  617. }
  618. else
  619. {
  620. x1 = y1 = r = 0;
  621. ptr3 = text;
  622. if (ptr3)
  623. sscanf (ptr3, "%d", &x1);
  624. ptr3 = SkipInt (ptr3);
  625. ptr3 = SkipSep (ptr3);
  626. if (ptr3)
  627. sscanf (ptr3, "%d", &y1);
  628. ptr3 = SkipInt (ptr3);
  629. ptr3 = SkipSep (ptr3);
  630. if (ptr3)
  631. sscanf (ptr3, "%d", &r);
  632. TtaSetAttributeValue (attrX, x1 - r, element, document);
  633. TtaSetAttributeValue (attrY, y1 - r, element, document);
  634. TtaSetAttributeValue (attrW, 2 * r, element, document);
  635. TtaSetAttributeValue (attrH, 2 * r, element, document);
  636. }
  637. }
  638. else if (shape == HTML_ATTR_shape_VAL_polygon)
  639. {
  640. element = TtaGetFirstChild (element);
  641. length = TtaGetPolylineLength (element);
  642. /* remove previous points */
  643. while (length > 1)
  644. {
  645. TtaDeletePointInPolyline (element, length, document);
  646. length--;
  647. }
  648. length = 1;
  649. ptr3 = text;
  650. /* add new points */
  651. while (*ptr3 != EOS)
  652. {
  653. x1 = y1 = 0;
  654. sscanf (ptr3, "%d", &x1);
  655. ptr3 = SkipInt (ptr3);
  656. ptr3 = SkipSep (ptr3);
  657. if (ptr3)
  658. sscanf (ptr3, "%d", &y1);
  659. ptr3 = SkipInt (ptr3);
  660. ptr3 = SkipSep (ptr3);
  661. TtaAddPointInPolyline (element, length, UnPixel, x1, y1,document,
  662. FALSE);
  663. length++;
  664. }
  665. }
  666. TtaFreeMemory (text);
  667. }
  668. /*----------------------------------------------------------------------
  669. SetLanguagInHTMLStack
  670. Sets the value of the language.
  671. ----------------------------------------------------------------------*/
  672. void SetLanguagInHTMLStack (Language lang)
  673. {
  674. LanguageStack[StackLevel - 1] = lang;
  675. }
  676. /*----------------------------------------------------------------------
  677. IsHtmlParsingCSS
  678. Returns the value of ParsingCSS boolean.
  679. ----------------------------------------------------------------------*/
  680. ThotBool IsHtmlParsingCSS ()
  681. {
  682. return HTMLcontext.parsingCSS;
  683. }
  684. /*----------------------------------------------------------------------
  685. SetHtmlParsingCSS
  686. Sets the value of ParsingCSS boolean.
  687. ----------------------------------------------------------------------*/
  688. void SetHtmlParsingCSS (ThotBool value)
  689. {
  690. HTMLcontext.parsingCSS = value;
  691. }
  692. /*----------------------------------------------------------------------
  693. SetHtmlParsingTextArea
  694. Sets the value of ParsingTextArea boolean.
  695. ----------------------------------------------------------------------*/
  696. void SetHtmlParsingTextArea (ThotBool value)
  697. {
  698. HTMLcontext.parsingTextArea = value;
  699. }
  700. /*----------------------------------------------------------------------
  701. SetHtmlParsingScript
  702. Sets the value of ParsingScript boolean.
  703. ----------------------------------------------------------------------*/
  704. void SetHtmlParsingScript (ThotBool value)
  705. {
  706. HTMLcontext.parsingScript = value;
  707. }
  708. /*----------------------------------------------------------------------
  709. SetHtmlElemLineNumber
  710. Assigns the current line number
  711. ----------------------------------------------------------------------*/
  712. void SetHtmlElemLineNumber (Element el)
  713. {
  714. TtaSetElementLineNumber (el, NumberOfLinesRead);
  715. }
  716. /*----------------------------------------------------------------------
  717. IsWithinHtmlTable
  718. Returns the value of WithinTable integer.
  719. ----------------------------------------------------------------------*/
  720. int IsWithinHtmlTable ()
  721. {
  722. return HTMLcontext.withinTable;
  723. }
  724. /*----------------------------------------------------------------------
  725. copyCEstring create a copy of the string of elements pointed
  726. by first and return a pointer on the first
  727. element of the copy.
  728. ----------------------------------------------------------------------*/
  729. static PtrClosedElement copyCEstring (PtrClosedElement first)
  730. {
  731. PtrClosedElement ret, cur, next, prev;
  732. ret = NULL;
  733. cur = first;
  734. prev = NULL;
  735. while (cur != NULL)
  736. {
  737. next = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
  738. next->nextClosedElem = NULL;
  739. next->tagNum = cur->tagNum;
  740. if (ret == NULL)
  741. ret = next;
  742. else
  743. prev->nextClosedElem = next;
  744. prev = next;
  745. cur = cur->nextClosedElem;
  746. }
  747. return ret;
  748. }
  749. /*----------------------------------------------------------------------
  750. InitMapping intialise the list of the elements closed by
  751. each start tag.
  752. ----------------------------------------------------------------------*/
  753. void InitMapping (void)
  754. {
  755. int line;
  756. int entry;
  757. int ptr;
  758. int i;
  759. typeName name;
  760. PtrClosedElement newCE, lastCE, firstCE, curCE;
  761. SSchema schema;
  762. /* building the table */
  763. FirstClosedElem = (PtrClosedElement *)TtaGetMemory (HTML_ENTRIES * sizeof(PtrClosedElement));
  764. for (entry = 0; entry < HTML_ENTRIES; entry++)
  765. FirstClosedElem[entry] = NULL;
  766. /* read table EquivEndingElem */
  767. line = 0;
  768. do
  769. /* read one line of EquivEndingElem */
  770. {
  771. ptr = 0;
  772. lastCE = NULL;
  773. firstCE = NULL;
  774. do
  775. {
  776. /* read one identifier */
  777. i = 0;
  778. while (EquivEndingElem[line][ptr] != SPACE &&
  779. EquivEndingElem[line][ptr] != EOS)
  780. name[i++] = EquivEndingElem[line][ptr++];
  781. name[i] = EOS;
  782. ptr++;
  783. if (i > 0)
  784. /* a identifier has been read */
  785. {
  786. schema = DocumentSSchema;
  787. entry = MapGI ((char *)name, &schema, HTMLcontext.doc);
  788. #ifdef DEBUG
  789. if (entry < 0)
  790. fprintf (stderr, "error in EquivEndingElem: tag %s unknown in line\n%s\n", name, EquivEndingElem[line]);
  791. else
  792. #endif
  793. {
  794. newCE = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
  795. newCE->nextClosedElem = NULL;
  796. newCE->tagNum = entry;
  797. if (firstCE == NULL)
  798. firstCE = newCE;
  799. else
  800. lastCE->nextClosedElem = newCE;
  801. lastCE = newCE;
  802. }
  803. }
  804. }
  805. while (EquivEndingElem[line][ptr] != EOS);
  806. /* one line has been read */
  807. curCE = firstCE;
  808. while (curCE != NULL)
  809. {
  810. if (curCE->nextClosedElem == NULL)
  811. newCE = firstCE;
  812. else
  813. newCE = copyCEstring (firstCE);
  814. if (FirstClosedElem[curCE->tagNum] == NULL)
  815. FirstClosedElem[curCE->tagNum] = newCE;
  816. else
  817. {
  818. lastCE = FirstClosedElem[curCE->tagNum];
  819. while (lastCE->nextClosedElem != NULL)
  820. lastCE = lastCE->nextClosedElem;
  821. lastCE->nextClosedElem = newCE;
  822. }
  823. curCE = curCE->nextClosedElem;
  824. }
  825. line++;
  826. }
  827. while (strcmp (EquivEndingElem[line], "") != 0);
  828. /* read table StartTagEndingElem */
  829. line = 0;
  830. do
  831. /* read one line of StartTagEndingElem */
  832. {
  833. ptr = 0;
  834. i = 0;
  835. /* read the first tag name of the line */
  836. while (StartTagEndingElem[line][ptr] != SPACE &&
  837. StartTagEndingElem[line][ptr] != EOS)
  838. name[i++] = StartTagEndingElem[line][ptr++];
  839. name[i] = EOS;
  840. i = 0;
  841. ptr++;
  842. schema = DocumentSSchema;
  843. entry = MapGI ((char *)name, &schema, HTMLcontext.doc);
  844. #ifdef DEBUG
  845. if (entry < 0)
  846. fprintf (stderr, "error in StartTagEndingElem: tag %s unknown in line\n%s\n", name, StartTagEndingElem[line]);
  847. #endif
  848. /* read the keyword "closes" */
  849. while (StartTagEndingElem[line][ptr] != SPACE &&
  850. StartTagEndingElem[line][ptr] != EOS)
  851. name[i++] = StartTagEndingElem[line][ptr++];
  852. name[i] = EOS;
  853. i = 0;
  854. ptr++;
  855. #ifdef DEBUG
  856. if (strcmp (name, "closes") != 0)
  857. fprintf (stderr, "error in StartTagEndingElem: \"%s\" instead of \"closes\" in line\n%s\n", name, StartTagEndingElem[line]);
  858. #endif
  859. lastCE = FirstClosedElem[entry];
  860. if (lastCE != NULL)
  861. while (lastCE->nextClosedElem != NULL)
  862. lastCE = lastCE->nextClosedElem;
  863. do
  864. {
  865. while (StartTagEndingElem[line][ptr] != SPACE &&
  866. StartTagEndingElem[line][ptr] != EOS)
  867. name[i++] = StartTagEndingElem[line][ptr++];
  868. name[i] = EOS;
  869. ptr++;
  870. if (i > 0)
  871. {
  872. i = 0;
  873. newCE = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
  874. newCE->nextClosedElem = NULL;
  875. schema = DocumentSSchema;
  876. newCE->tagNum = MapGI ((char *)name, &schema, HTMLcontext.doc);
  877. #ifdef DEBUG
  878. if (newCE->tagNum < 0)
  879. fprintf (stderr, "error in StartTagEndingElem: tag %s unknown in line\n%s\n", name, StartTagEndingElem[line]);
  880. #endif
  881. if (lastCE == NULL)
  882. FirstClosedElem[entry] = newCE;
  883. else
  884. lastCE->nextClosedElem = newCE;
  885. lastCE = newCE;
  886. }
  887. }
  888. while (StartTagEndingElem[line][ptr] != EOS);
  889. line++;
  890. }
  891. while (strcmp (StartTagEndingElem[line], "") != 0);
  892. }
  893. /*----------------------------------------------------------------------
  894. Within checks if an element of type ThotType is in the stack.
  895. ----------------------------------------------------------------------*/
  896. static ThotBool Within (int ThotType, SSchema ThotSSchema)
  897. {
  898. ThotBool ret;
  899. int i;
  900. ElementType elType;
  901. ret = FALSE;
  902. i = StackLevel - 1;
  903. while (i >= 0 && !ret)
  904. {
  905. if (ElementStack[i] != NULL)
  906. {
  907. elType = TtaGetElementType (ElementStack[i]);
  908. if (elType.ElTypeNum == ThotType &&
  909. elType.ElSSchema == ThotSSchema)
  910. ret = TRUE;
  911. }
  912. i--;
  913. }
  914. return ret;
  915. }
  916. /*----------------------------------------------------------------------
  917. HTMLParseError print the error message msg on stderr.
  918. If lineNumber = 0, print the current line number in the source file,
  919. otherwise print the line number provided.
  920. ----------------------------------------------------------------------*/
  921. void HTMLParseError (Document doc, const char* msg, int lineNumber)
  922. {
  923. if (IgnoreErrors)
  924. return;
  925. HTMLErrorsFound = TRUE;
  926. if (!ErrFile)
  927. if (OpenParsingErrors (doc) == FALSE)
  928. return;
  929. if (doc == HTMLcontext.doc)
  930. {
  931. /* the error message is related to the document being parsed */
  932. if (docURL != NULL)
  933. {
  934. if (!XMLErrorsFound)
  935. fprintf (ErrFile, "\n*** Errors/warnings in %s\n", docURL);
  936. TtaFreeMemory (docURL);
  937. docURL = NULL;
  938. }
  939. else
  940. {
  941. if (CSSErrorsFound && docURL2)
  942. {
  943. fprintf (ErrFile, "\n*** Errors/warnings in %s\n", docURL2);
  944. TtaFreeMemory (docURL2);
  945. docURL2 = NULL;
  946. }
  947. }
  948. if (lineNumber <= 0)
  949. /* print the line number and character number before the message */
  950. fprintf (ErrFile, "@ line %d, char %d: %s\n", NumberOfLinesRead,
  951. NumberOfCharRead, msg);
  952. else
  953. fprintf (ErrFile, "@ line %d, char 0: %s\n", lineNumber, msg);
  954. }
  955. else
  956. /* print only the error message */
  957. fprintf (ErrFile, "%s\n", msg);
  958. }
  959. /*----------------------------------------------------------------------
  960. CloseBuffer close the input buffer.
  961. ----------------------------------------------------------------------*/
  962. static void CloseBuffer ()
  963. {
  964. inputBuffer[LgBuffer] = EOS;
  965. }
  966. /*----------------------------------------------------------------------
  967. InitBuffer initialize the input buffer.
  968. ----------------------------------------------------------------------*/
  969. static void InitBuffer ()
  970. {
  971. LgBuffer = 0;
  972. }
  973. static ThotBool InsertElement (Element * el);
  974. /*----------------------------------------------------------------------
  975. InsertSibling return TRUE if the new element must be inserted
  976. in the Thot document as a sibling of lastElement;
  977. return FALSE it it must be inserted as a child.
  978. ----------------------------------------------------------------------*/
  979. static ThotBool InsertSibling ()
  980. {
  981. if (StackLevel == 0)
  982. return FALSE;
  983. else if (HTMLcontext.lastElementClosed ||
  984. TtaIsLeaf (TtaGetElementType (HTMLcontext.lastElement)) ||
  985. (GINumberStack[StackLevel - 1] >= 0 &&
  986. pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLcontents == 'E'))
  987. return TRUE;
  988. else
  989. return FALSE;
  990. }
  991. /*----------------------------------------------------------------------
  992. IsEmptyElement return TRUE if element el is defined as an empty element.
  993. ----------------------------------------------------------------------*/
  994. static ThotBool IsEmptyElement (Element el)
  995. {
  996. ElementType elType;
  997. int i;
  998. ThotBool ret;
  999. ret = FALSE;
  1000. elType = TtaGetElementType (el);
  1001. if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
  1002. return ret;
  1003. i = 0;
  1004. while (EmptyElement[i] > 0 && EmptyElement[i] != elType.ElTypeNum)
  1005. i++;
  1006. if (EmptyElement[i] == elType.ElTypeNum)
  1007. ret = TRUE;
  1008. return ret;
  1009. }
  1010. /*----------------------------------------------------------------------
  1011. IsCharacterLevelType return TRUE if element type is a
  1012. character level element, FALSE if not.
  1013. ----------------------------------------------------------------------*/
  1014. ThotBool IsCharacterLevelType (ElementType elType)
  1015. {
  1016. int i;
  1017. ThotBool ret;
  1018. ret = FALSE;
  1019. if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
  1020. return ret;
  1021. i = 0;
  1022. while (CharLevelElement[i] > 0 &&
  1023. CharLevelElement[i] != elType.ElTypeNum)
  1024. i++;
  1025. if (CharLevelElement[i] == elType.ElTypeNum)
  1026. ret = TRUE;
  1027. return ret;
  1028. }
  1029. /*----------------------------------------------------------------------
  1030. IsCharacterLevelElement return TRUE if element el is a
  1031. character level element, FALSE if not.
  1032. ----------------------------------------------------------------------*/
  1033. ThotBool IsCharacterLevelElement (Element el)
  1034. {
  1035. ElementType elType;
  1036. elType = TtaGetElementType (el);
  1037. return IsCharacterLevelType (elType);
  1038. }
  1039. /*----------------------------------------------------------------------
  1040. IsBlockElementType return TRUE if element type is a block element.
  1041. Same as IsBlockElement but just with the element type.
  1042. ----------------------------------------------------------------------*/
  1043. ThotBool IsBlockElementType (ElementType elType)
  1044. {
  1045. int i;
  1046. ThotBool ret;
  1047. ret = FALSE;
  1048. if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
  1049. return ret;
  1050. i = 0;
  1051. while (BlockLevelElement[i] > 0 &&
  1052. BlockLevelElement[i] != elType.ElTypeNum)
  1053. i++;
  1054. if (BlockLevelElement[i] == elType.ElTypeNum)
  1055. ret = TRUE;
  1056. return ret;
  1057. }
  1058. /*----------------------------------------------------------------------
  1059. IsBlockElement return TRUE if element el is a block element.
  1060. ----------------------------------------------------------------------*/
  1061. ThotBool IsBlockElement (Element el)
  1062. {
  1063. ElementType elType;
  1064. elType = TtaGetElementType (el);
  1065. return IsBlockElementType (elType);
  1066. }
  1067. /*----------------------------------------------------------------------
  1068. TextToDocument Put the content of input buffer in the document.
  1069. ----------------------------------------------------------------------*/
  1070. static void TextToDocument ()
  1071. {
  1072. ElementType elType;
  1073. Element elText, parent;
  1074. int i;
  1075. ThotBool ignoreLeadingSpaces;
  1076. ThotBool insSibling, ok;
  1077. CloseBuffer ();
  1078. if (HTMLcontext.lastElement)
  1079. {
  1080. i = 0;
  1081. insSibling = InsertSibling ();
  1082. ignoreLeadingSpaces = IsLeadingSpaceUseless (HTMLcontext.lastElement,
  1083. HTMLcontext.doc, insSibling, FALSE);
  1084. if (ignoreLeadingSpaces &&
  1085. !Within (HTML_EL_Preformatted, DocumentSSchema) &&
  1086. !Within (HTML_EL_STYLE_, DocumentSSchema) &&
  1087. !Within (HTML_EL_SCRIPT_, DocumentSSchema))
  1088. /* suppress leading spaces */
  1089. while (inputBuffer[i] <= SPACE && inputBuffer[i] != EOS)
  1090. i++;
  1091. if (inputBuffer[i] != EOS)
  1092. {
  1093. elType = TtaGetElementType (HTMLcontext.lastElement);
  1094. if (elType.ElTypeNum == HTML_EL_TEXT_UNIT && HTMLcontext.mergeText)
  1095. TtaAppendTextContent (HTMLcontext.lastElement, (unsigned char *)&(inputBuffer[i]),
  1096. HTMLcontext.doc);
  1097. else
  1098. {
  1099. if (inputBuffer[i] == SPACE && LgBuffer == 1)
  1100. {
  1101. // avoid to generate an empty pseudo paragraph
  1102. ok = FALSE;
  1103. if (InsertSibling ())
  1104. parent = TtaGetParent (HTMLcontext.lastElement);
  1105. else
  1106. parent = HTMLcontext.lastElement;
  1107. if (parent)
  1108. {
  1109. elType = TtaGetElementType (parent);
  1110. if (IsCharacterLevelElement (parent) ||
  1111. !XhtmlCannotContainText (elType))
  1112. ok = TRUE; // generate the TEXT element
  1113. }
  1114. }
  1115. else
  1116. ok = TRUE;
  1117. if (ok)
  1118. {
  1119. /* create a TEXT element */
  1120. elType.ElSSchema = DocumentSSchema;
  1121. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  1122. elText = TtaNewElement (HTMLcontext.doc, elType);
  1123. TtaSetElementLineNumber (elText, BufferLineNumber);
  1124. InsertElement (&elText);
  1125. HTMLcontext.lastElementClosed = TRUE;
  1126. HTMLcontext.mergeText = TRUE;
  1127. /* put the content of the input buffer into the TEXT element */
  1128. if (elText)
  1129. TtaSetTextContent (elText, (unsigned char *)&(inputBuffer[i]),
  1130. HTMLcontext.language, HTMLcontext.doc);
  1131. }
  1132. }
  1133. }
  1134. }
  1135. InitBuffer ();
  1136. }
  1137. /*----------------------------------------------------------------------
  1138. StartOfTag Beginning of a HTML tag (start or end tag).
  1139. Put the preceding text into the Thot document.
  1140. ----------------------------------------------------------------------*/
  1141. static void StartOfTag (char c)
  1142. {
  1143. if (LgBuffer > 0)
  1144. TextToDocument ();
  1145. HTMLcontext.mergeText = FALSE;
  1146. StartOfTagIndx = CurrentBufChar - 1;
  1147. PreviousNumberOfCharRead = NumberOfCharRead - 1;
  1148. /* Is there an EOL or CR character inside tag ?? */
  1149. PreviousNumberOfLinesRead = NumberOfLinesRead;
  1150. }
  1151. /*----------------------------------------------------------------------
  1152. PutInBuffer put character c in the input buffer.
  1153. ----------------------------------------------------------------------*/
  1154. static void PutInBuffer (unsigned char c)
  1155. {
  1156. int len;
  1157. /* put the character into the buffer if it is not an ignored char. */
  1158. if ((int) c == TAB) /* HT */
  1159. len = 8; /* HT = 8 spaces */
  1160. else
  1161. len = 1;
  1162. if (c != EOS)
  1163. {
  1164. if (LgBuffer + len >= AllmostFullBuffer &&
  1165. // simplete text or cdata
  1166. (currentState == 0 || currentState == 24))
  1167. TextToDocument ();
  1168. if (LgBuffer + len >= MaxBufferLength)
  1169. {
  1170. if (currentState == 0)
  1171. TextToDocument ();
  1172. else if (currentState == 6)
  1173. {
  1174. TruncatedAttrValue = TRUE;
  1175. EndOfAttrValue (c);
  1176. TruncatedAttrValue = FALSE;
  1177. }
  1178. else
  1179. HTMLParseError (HTMLcontext.doc, "Buffer overflow", 0);
  1180. LgBuffer = 0;
  1181. }
  1182. if (LgBuffer == 0)
  1183. BufferLineNumber = NumberOfLinesRead;
  1184. if (len == 1)
  1185. inputBuffer[LgBuffer++] = c;
  1186. else
  1187. /* HT */
  1188. do
  1189. {
  1190. inputBuffer[LgBuffer++] = SPACE;
  1191. len--;
  1192. }
  1193. while (len > 0);
  1194. }
  1195. }
  1196. /*----------------------------------------------------------------------
  1197. BlockInCharLevelElem
  1198. Element el is a block-level element. If its parent is a character-level
  1199. element, add a record in the list of block-level elements to be
  1200. checked when the document is complete.
  1201. ----------------------------------------------------------------------*/
  1202. void BlockInCharLevelElem (Element el)
  1203. {
  1204. PtrElemToBeChecked nextElTBC, elTBC;
  1205. Element parent;
  1206. ElementType elType, parentType;
  1207. parent = TtaGetParent (el);
  1208. elType = TtaGetElementType (el);
  1209. /* a <div> within a <button> is allowed */
  1210. if (elType.ElTypeNum == HTML_EL_Division)
  1211. {
  1212. parentType = TtaGetElementType (parent);
  1213. if (parentType.ElTypeNum == HTML_EL_BUTTON_)
  1214. return;
  1215. }
  1216. if (LastElemToBeChecked != NULL)
  1217. {
  1218. nextElTBC = FirstElemToBeChecked;
  1219. while (nextElTBC != NULL)
  1220. {
  1221. if (nextElTBC->Elem == el)
  1222. /* this element is already in the queue */
  1223. return;
  1224. else
  1225. nextElTBC = nextElTBC->nextElemToBeChecked;
  1226. }
  1227. }
  1228. if (parent != NULL)
  1229. if (IsCharacterLevelElement (parent))
  1230. {
  1231. elTBC = (PtrElemToBeChecked) TtaGetMemory(sizeof(ElemToBeChecked));
  1232. elTBC->Elem = el;
  1233. elTBC->nextElemToBeChecked = NULL;
  1234. if (LastElemToBeChecked == NULL)
  1235. FirstElemToBeChecked = elTBC;
  1236. else
  1237. LastElemToBeChecked->nextElemToBeChecked = elTBC;
  1238. LastElemToBeChecked = elTBC;
  1239. }
  1240. }
  1241. /*----------------------------------------------------------------------
  1242. CheckSurrounding
  1243. inserts an element Pseudo_paragraph in the abstract tree of the Thot
  1244. document if el is a leaf and is not allowed to be a child of element parent.
  1245. Return TRUE if element *el has been inserted in the tree.
  1246. ----------------------------------------------------------------------*/
  1247. static ThotBool CheckSurrounding (Element * el, Element parent)
  1248. {
  1249. ElementType parentType, newElType, elType, ancestorType;
  1250. Element newEl, ancestor, prev, prevprev;
  1251. ThotBool ret;
  1252. if (parent == NULL)
  1253. return(FALSE);
  1254. ret = FALSE;
  1255. elType = TtaGetElementType (*el);
  1256. if (elType.ElTypeNum == HTML_EL_TEXT_UNIT || elType.ElTypeNum == HTML_EL_BR
  1257. || elType.ElTypeNum == HTML_EL_PICTURE_UNIT
  1258. || elType.ElTypeNum == HTML_EL_IFRAME
  1259. || elType.ElTypeNum == HTML_EL_IMG
  1260. || elType.ElTypeNum == HTML_EL_Input
  1261. || elType.ElTypeNum == HTML_EL_Text_Area
  1262. || IsCharacterLevelElement (*el))
  1263. {
  1264. /* the element to be inserted is a character string */
  1265. /* Search the ancestor that is not a character level element */
  1266. ancestor = parent;
  1267. while (ancestor != NULL &&
  1268. (IsCharacterLevelElement (ancestor) ||
  1269. !strcmp (TtaGetSSchemaName (TtaGetElementType(ancestor).ElSSchema), "Template")))
  1270. ancestor = TtaGetParent (ancestor);
  1271. if (ancestor != NULL)
  1272. {
  1273. ancestorType = TtaGetElementType (ancestor);
  1274. if (XhtmlCannotContainText (ancestorType) &&
  1275. !Within (HTML_EL_Option_Menu, DocumentSSchema))
  1276. /* Element ancestor cannot contain text directly. Create a */
  1277. /* Pseudo_paragraph element as the parent of the text element */
  1278. {
  1279. newElType.ElSSchema = DocumentSSchema;
  1280. newElType.ElTypeNum = HTML_EL_Pseudo_paragraph;
  1281. newEl = TtaNewElement (HTMLcontext.doc, newElType);
  1282. TtaSetElementLineNumber (newEl, NumberOfLinesRead);
  1283. /* insert the new Pseudo_paragraph element */
  1284. InsertElement (&newEl);
  1285. if (newEl != NULL)
  1286. {
  1287. /* insert the Text element in the tree */
  1288. TtaInsertFirstChild (el, newEl, HTMLcontext.doc);
  1289. BlockInCharLevelElem (newEl);
  1290. ret = TRUE;
  1291. /* if previous siblings of the new Pseudo_paragraph element
  1292. are character level elements, move them within the new
  1293. Pseudo_paragraph element */
  1294. prev = newEl;
  1295. TtaPreviousSibling (&prev);
  1296. while (prev != NULL)
  1297. {
  1298. if (!IsCharacterLevelElement (prev))
  1299. prev = NULL;
  1300. else
  1301. {
  1302. prevprev = prev; TtaPreviousSibling (&prevprev);
  1303. TtaRemoveTree (prev, HTMLcontext.doc);
  1304. TtaInsertFirstChild (&prev, newEl, HTMLcontext.doc);
  1305. prev = prevprev;
  1306. }
  1307. }
  1308. }
  1309. }
  1310. }
  1311. }
  1312. if (elType.ElTypeNum == HTML_EL_TEXT_UNIT ||
  1313. (elType.ElTypeNum != HTML_EL_Inserted_Text &&
  1314. IsCharacterLevelElement (*el)))
  1315. /* it is a character level element */
  1316. {
  1317. parentType = TtaGetElementType (parent);
  1318. if (parentType.ElTypeNum == HTML_EL_Text_Area)
  1319. /* A basic element cannot be a child of a Text_Area */
  1320. /* create a Inserted_Text element as a child of Text_Area */
  1321. {
  1322. newElType.ElSSchema = DocumentSSchema;
  1323. newElType.ElTypeNum = HTML_EL_Inserted_Text;
  1324. newEl = TtaNewElement (HTMLcontext.doc, newElType);
  1325. TtaSetElementLineNumber (newEl, NumberOfLinesRead);
  1326. InsertElement (&newEl);
  1327. if (newEl != NULL)
  1328. {
  1329. TtaInsertFirstChild (el, newEl, HTMLcontext.doc);
  1330. ret = TRUE;
  1331. }
  1332. }
  1333. }
  1334. return ret;
  1335. }
  1336. /*----------------------------------------------------------------------
  1337. InsertElement inserts element el in the abstract tree of the
  1338. Thot document, at the current position.
  1339. ----------------------------------------------------------------------*/
  1340. static ThotBool InsertElement (Element *el)
  1341. {
  1342. ThotBool ret;
  1343. Element parent;
  1344. if (InsertSibling ())
  1345. {
  1346. if (HTMLcontext.lastElement == NULL)
  1347. parent = NULL;
  1348. else
  1349. parent = TtaGetParent (HTMLcontext.lastElement);
  1350. if (!CheckSurrounding (el, parent))
  1351. {
  1352. if (parent != NULL)
  1353. TtaInsertSibling (*el, HTMLcontext.lastElement, FALSE, HTMLcontext.doc);
  1354. else
  1355. {
  1356. TtaDeleteTree (*el, HTMLcontext.doc);
  1357. *el = NULL;
  1358. }
  1359. }
  1360. ret = TRUE;
  1361. }
  1362. else
  1363. {
  1364. if (!CheckSurrounding (el, HTMLcontext.lastElement))
  1365. TtaInsertFirstChild (el, HTMLcontext.lastElement, HTMLcontext.doc);
  1366. ret = FALSE;
  1367. }
  1368. if (*el != NULL)
  1369. {
  1370. HTMLcontext.lastElement = *el;
  1371. HTMLcontext.lastElementClosed = FALSE;
  1372. }
  1373. return ret;
  1374. }
  1375. /*----------------------------------------------------------------------
  1376. ProcessOptionElement
  1377. If multiple is FALSE, remove the SELECTED attribute from the
  1378. option element, except if it's element el.
  1379. If parsing is TRUE, associate a DefaultSelected attribute with
  1380. element option if it has a SELECTED attribute.
  1381. ----------------------------------------------------------------------*/
  1382. static void ProcessOptionElement (Element option, Element el,
  1383. Document doc, ThotBool multiple,
  1384. ThotBool parsing)
  1385. {
  1386. ElementType elType;
  1387. AttributeType attrType;
  1388. Attribute attr;
  1389. elType = TtaGetElementType (option);
  1390. attrType.AttrSSchema = elType.ElSSchema;
  1391. attrType.AttrTypeNum = HTML_ATTR_Selected;
  1392. if (!multiple && option != el)
  1393. {
  1394. /* Search the SELECTED attribute */
  1395. attr = TtaGetAttribute (option, attrType);
  1396. /* remove it if it exists */
  1397. if (attr)
  1398. TtaRemoveAttribute (option, attr, doc);
  1399. }
  1400. if (parsing)
  1401. {
  1402. attr = TtaGetAttribute (option, attrType);
  1403. if (attr != NULL)
  1404. {
  1405. attrType.AttrTypeNum = HTML_ATTR_DefaultSelected;
  1406. attr = TtaGetAttribute (option, attrType);
  1407. if (!attr)
  1408. {
  1409. /* create the DefaultSelected attribute */
  1410. attr = TtaNewAttribute (attrType);
  1411. TtaAttachAttribute (option, attr, doc);
  1412. TtaSetAttributeValue (attr, HTML_ATTR_DefaultSelected_VAL_Yes_,
  1413. option, doc);
  1414. }
  1415. }
  1416. }
  1417. }
  1418. /*----------------------------------------------------------------------
  1419. OnlyOneOptionSelected
  1420. If the option menu el is a single-choice menu, check that only
  1421. one option has an attribute Selected.
  1422. If there is no option element with an attribute Selected, put an
  1423. attribute ShowMe on the first option.
  1424. If parsing is TRUE, associate an attribute DefaultSelected with
  1425. each option having an attribute Selected.
  1426. ----------------------------------------------------------------------*/
  1427. void OnlyOneOptionSelected (Element el, Document doc, ThotBool parsing)
  1428. {
  1429. ElementType elType, opType;
  1430. Element option, menu, child, firstOption;
  1431. AttributeType attrType, attrshowMeType;
  1432. Attribute attr, showMeAttr;
  1433. ThotBool multiple;
  1434. if (el == NULL)
  1435. return;
  1436. menu = NULL;
  1437. attr = NULL;
  1438. firstOption = NULL;
  1439. elType = TtaGetElementType (el);
  1440. attrType.AttrSSchema = elType.ElSSchema;
  1441. attrType.AttrTypeNum = HTML_ATTR_Selected;
  1442. attrshowMeType.AttrSSchema = elType.ElSSchema;
  1443. attrshowMeType.AttrTypeNum = HTML_ATTR_ShowMe;
  1444. if (elType.ElTypeNum == HTML_EL_Option_Menu)
  1445. {
  1446. /* it's a menu (SELECT) */
  1447. menu = el;
  1448. /* search the first OPTION element having an attribute SELECTED */
  1449. option = TtaGetFirstChild (el);
  1450. while (option && !attr)
  1451. {
  1452. elType = TtaGetElementType (option);
  1453. if (elType.ElTypeNum == HTML_EL_Option)
  1454. {
  1455. attr = TtaGetAttribute (option, attrType);
  1456. if (!firstOption)
  1457. firstOption = option;
  1458. }
  1459. else if (elType.ElTypeNum == HTML_EL_OptGroup)
  1460. {
  1461. child = TtaGetFirstChild (option);
  1462. while (child && !attr)
  1463. {
  1464. elType = TtaGetElementType (child);
  1465. if (elType.ElTypeNum == HTML_EL_Option)
  1466. {
  1467. attr = TtaGetAttribute (child, attrType);
  1468. if (!firstOption)
  1469. firstOption = child;
  1470. }
  1471. if (attr)
  1472. option = child;
  1473. else
  1474. TtaNextSibling (&child);
  1475. }
  1476. }
  1477. if (!attr)
  1478. TtaNextSibling (&option);
  1479. }
  1480. el = option;
  1481. }
  1482. else
  1483. {
  1484. option = NULL;
  1485. do
  1486. {
  1487. if (elType.ElTypeNum == HTML_EL_Option_Menu)
  1488. menu = el;
  1489. else
  1490. {
  1491. if (elType.ElTypeNum == HTML_EL_Option)
  1492. option = el;
  1493. el = TtaGetParent (el);
  1494. if (el)
  1495. elType = TtaGetElementType (el);
  1496. }
  1497. }
  1498. while (el && !menu);
  1499. el = option;
  1500. }
  1501. if (el)
  1502. {
  1503. if (menu)
  1504. {
  1505. /* Remove the SELECTED attribute from other options in the menu */
  1506. /* if it's not a multiple-choices menu. */
  1507. /* When parsing the HTML file, associate a DefaultSelected */
  1508. /* attribute with each element having a SELECTED attribute */
  1509. attrType.AttrTypeNum = HTML_ATTR_Multiple;
  1510. multiple = (TtaGetAttribute (menu, attrType) != NULL);
  1511. if (parsing || !multiple)
  1512. {
  1513. option = TtaGetFirstChild (menu);
  1514. opType = TtaGetElementType (menu);
  1515. opType.ElTypeNum = HTML_EL_Option;
  1516. while (option)
  1517. {
  1518. elType = TtaGetElementType (option);
  1519. if (elType.ElTypeNum == HTML_EL_Option)
  1520. {
  1521. ProcessOptionElement (option, el, doc, multiple,
  1522. parsing);
  1523. if (!firstOption)
  1524. firstOption = option;
  1525. }
  1526. else
  1527. {
  1528. child = TtaSearchTypedElement (opType, SearchInTree, option);
  1529. while (child)
  1530. {
  1531. ProcessOptionElement (child, el, doc, multiple, parsing);
  1532. if (!firstOption)
  1533. firstOption = option;
  1534. // look for the next option
  1535. child = TtaSearchTypedElementInTree (opType, SearchForward, option, child);
  1536. }
  1537. }
  1538. TtaNextSibling (&option);
  1539. }
  1540. }
  1541. }
  1542. /* set this option SELECTED */
  1543. attrType.AttrTypeNum = HTML_ATTR_Selected;
  1544. attr = TtaGetAttribute (el, attrType);
  1545. if (attr == NULL)
  1546. {
  1547. /* create the SELECTED attribute */
  1548. attr = TtaNewAttribute (attrType);
  1549. TtaSetAttributeValue (attr, HTML_ATTR_Selected_VAL_Yes_, el,doc);
  1550. TtaAttachAttribute (el, attr, doc);
  1551. }
  1552. }
  1553. if (firstOption)
  1554. {
  1555. showMeAttr = TtaGetAttribute (firstOption, attrshowMeType);
  1556. if (attr)
  1557. /* there is at least one option element with a selected attribute.
  1558. Remove the ShowMe attribute from the first option element */
  1559. {
  1560. if (showMeAttr)
  1561. TtaRemoveAttribute (firstOption, showMeAttr, doc);
  1562. }
  1563. else
  1564. /* there is no option element with a selected attribute. Put
  1565. an attribute ShowMe on the first option element to display it
  1566. in the main view */
  1567. {
  1568. if (!showMeAttr)
  1569. {
  1570. showMeAttr = TtaNewAttribute (attrType);
  1571. TtaSetAttributeValue (showMeAttr, HTML_ATTR_ShowMe_VAL_Yes_,
  1572. firstOption, doc);
  1573. TtaAttachAttribute (firstOption, showMeAttr, doc);
  1574. }
  1575. }
  1576. }
  1577. }
  1578. /*----------------------------------------------------------------------
  1579. LastLeafInElement
  1580. return the last leaf element in element el.
  1581. ----------------------------------------------------------------------*/
  1582. static Element LastLeafInElement (Element el)
  1583. {
  1584. Element child, lastLeaf;
  1585. child = el;
  1586. lastLeaf = NULL;
  1587. while (child != NULL)
  1588. {
  1589. child = TtaGetLastChild (child);
  1590. if (child != NULL)
  1591. lastLeaf = child;
  1592. }
  1593. return lastLeaf;
  1594. }
  1595. /*----------------------------------------------------------------------
  1596. CheckIconLink
  1597. The element is a HTML link.
  1598. Check element attributes and load the style sheet if needed.
  1599. ----------------------------------------------------------------------*/
  1600. void CheckIconLink (Element el, Document doc, SSchema schema)
  1601. {
  1602. Attribute attr;
  1603. AttributeType attrType;
  1604. char *buff, *ptr;
  1605. int length;
  1606. /* A LINK element is complete.
  1607. If it is a link to an icon, add the icon to the page
  1608. */
  1609. attrType.AttrSSchema = schema;
  1610. attrType.AttrTypeNum = HTML_ATTR_REL;
  1611. attr = TtaGetAttribute (el, attrType);
  1612. if (attr)
  1613. {
  1614. /* get a buffer for the attribute value */
  1615. length = TtaGetTextAttributeLength (attr);
  1616. buff = (char*)TtaGetMemory (length + 1);
  1617. TtaGiveTextAttributeValue (attr, buff, &length);
  1618. ptr = strstr (buff, "icon");
  1619. if (ptr == NULL)
  1620. ptr = strstr (buff, "ICON");
  1621. if (ptr &&
  1622. DocumentMeta[doc] && DocumentMeta[doc]->method != CE_MAKEBOOK &&
  1623. DocumentMeta[doc]->link_icon == NULL)
  1624. DocumentMeta[doc]->link_icon = el;
  1625. TtaFreeMemory (buff);
  1626. }
  1627. }
  1628. /*----------------------------------------------------------------------
  1629. CheckCSSLink
  1630. The element is a HTML link.
  1631. Check element attributes and load the style sheet if needed.
  1632. ----------------------------------------------------------------------*/
  1633. void CheckCSSLink (Element el, Document doc, SSchema schema)
  1634. {
  1635. Attribute attr;
  1636. AttributeType attrType;
  1637. CSSmedia media;
  1638. char *utf8path, *buff;
  1639. int length;
  1640. /* A LINK element is complete.
  1641. If it is a link to a style sheet, load that style sheet.
  1642. */
  1643. if (IsCSSLink (el, doc))
  1644. {
  1645. /* it's a link to a style sheet */
  1646. /* get the media specification */
  1647. attrType.AttrSSchema = schema;
  1648. attrType.AttrTypeNum = HTML_ATTR_media;
  1649. attr = TtaGetAttribute (el, attrType);
  1650. if (attr)
  1651. {
  1652. length = TtaGetTextAttributeLength (attr);
  1653. buff = (char*)TtaGetMemory (length + 1);
  1654. TtaGiveTextAttributeValue (attr, buff, &length);
  1655. media = CheckMediaCSS (buff);
  1656. TtaFreeMemory (buff);
  1657. }
  1658. else
  1659. media = CSS_ALL;
  1660. /* Load that style sheet */
  1661. attrType.AttrTypeNum = HTML_ATTR_HREF_;
  1662. attr = TtaGetAttribute (el, attrType);
  1663. if (attr &&
  1664. DocumentMeta[doc] &&
  1665. DocumentMeta[doc]->method != CE_MAKEBOOK)
  1666. {
  1667. length = TtaGetTextAttributeLength (attr);
  1668. utf8path = (char*)TtaGetMemory (length + 1);
  1669. TtaGiveTextAttributeValue (attr, utf8path, &length);
  1670. /* load the stylesheet file found here ! */
  1671. buff = (char *)TtaConvertMbsToByte ((unsigned char *)utf8path,
  1672. TtaGetDefaultCharset ());
  1673. if (buff)
  1674. {
  1675. LoadStyleSheet (buff, doc, el, NULL, NULL, media, FALSE);
  1676. TtaFreeMemory (buff);
  1677. UpdateStyleList (doc, 1);
  1678. }
  1679. TtaFreeMemory (utf8path);
  1680. }
  1681. }
  1682. }
  1683. /*----------------------------------------------------------------------
  1684. RemoveEndingSpaces
  1685. If element el is a block-level element, remove all spaces contained
  1686. at the end of that element.
  1687. Return TRUE if spaces have been removed.
  1688. ----------------------------------------------------------------------*/
  1689. static ThotBool RemoveEndingSpaces (Element el)
  1690. {
  1691. int length;
  1692. ElementType elType;
  1693. Element lastLeaf;
  1694. ThotBool endingSpacesDeleted;
  1695. endingSpacesDeleted = FALSE;
  1696. elType = TtaGetElementType (el);
  1697. if (!TtaIsLeaf (elType))
  1698. /* it's a block element. */
  1699. {
  1700. /* Search the last leaf in the element's tree */
  1701. lastLeaf = LastLeafInElement (el);
  1702. if (elType.ElTypeNum == HTML_EL_Preformatted)
  1703. el = NULL;
  1704. else
  1705. {
  1706. // check if the element is within a preformatted
  1707. elType.ElTypeNum = HTML_EL_Preformatted;
  1708. el = TtaGetTypedAncestor (el, elType);
  1709. }
  1710. if (el == NULL && lastLeaf)
  1711. {
  1712. elType = TtaGetElementType (lastLeaf);
  1713. if (elType.ElTypeNum == HTML_EL_TEXT_UNIT)
  1714. /* the las leaf is a TEXT element */
  1715. {
  1716. length = TtaGetTextLength (lastLeaf);
  1717. if (length > 0)
  1718. TtaRemoveFinalSpaces (lastLeaf, HTMLcontext.doc, TRUE);
  1719. }
  1720. }
  1721. endingSpacesDeleted = TRUE;
  1722. }
  1723. return endingSpacesDeleted;
  1724. }
  1725. /*----------------------------------------------------------------------
  1726. CloseElement
  1727. End of HTML element defined in entry entry of pHTMLGIMapping.
  1728. Terminate all corresponding Thot elements.
  1729. If start < 0, an explicit end tag has been encountered in the HTML file,
  1730. else the end of element is implied by the beginning of an element
  1731. described by entry start of pHTMLGIMapping.
  1732. ----------------------------------------------------------------------*/
  1733. static ThotBool CloseElement (int entry, int start, ThotBool onStartTag)
  1734. {
  1735. int i;
  1736. ElementType elType, parentType;
  1737. Element el, parent;
  1738. ThotBool ret, stop, spacesDeleted;
  1739. int error;
  1740. ret = FALSE;
  1741. /* the closed HTML element corresponds to a Thot element. */
  1742. stop = FALSE;
  1743. /* type of the element to be closed */
  1744. elType.ElSSchema = DocumentSSchema;
  1745. elType.ElTypeNum = pHTMLGIMapping[entry].ThotType;
  1746. if (StackLevel > 0)
  1747. {
  1748. el = HTMLcontext.lastElement;
  1749. if (HTMLcontext.lastElementClosed)
  1750. el = TtaGetParent (el);
  1751. i = StackLevel - 1;
  1752. if (start < 0)
  1753. /* Explicit close */
  1754. {
  1755. /* If we meet the end tag of a form, font or center
  1756. looks for that element in the stack, but not at
  1757. a higher level as a table element */
  1758. if (!onStartTag &&
  1759. (!strcmp (pHTMLGIMapping[entry].XMLname, "form") ||
  1760. !strcmp (pHTMLGIMapping[entry].XMLname, "font") ||
  1761. !strcmp (pHTMLGIMapping[entry].XMLname, "center")))
  1762. while (i > 0 && entry != GINumberStack[i] && !stop)
  1763. if (!strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "tbody") ||
  1764. !strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "tr") ||
  1765. !strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "th") ||
  1766. !strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "td"))
  1767. {
  1768. /* ignore this end tag */
  1769. ret = FALSE;
  1770. stop = TRUE;
  1771. i = -1;
  1772. }
  1773. else
  1774. i--;
  1775. else
  1776. /* looks in the stack for the element to be closed */
  1777. while (i >= 0 && entry != GINumberStack[i])
  1778. i--;
  1779. }
  1780. else
  1781. /* Implicit close */
  1782. {
  1783. /* If the element to be closed is a list item (or
  1784. equivalent), looks for that element in the
  1785. stack, but not at a higher level as the list (or
  1786. equivalent) element */
  1787. if (!strcmp (pHTMLGIMapping[start].XMLname, "li"))
  1788. {
  1789. while (i > 0 && entry != GINumberStack[i] && !stop)
  1790. if (!strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "ol") ||
  1791. !strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "ul") ||
  1792. !strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "dir") ||
  1793. !strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "menu"))
  1794. stop = TRUE;
  1795. else
  1796. i--;
  1797. }
  1798. else if (!strcmp (pHTMLGIMapping[start].XMLname, "option"))
  1799. {
  1800. while (i > 0 && entry != GINumberStack[i] && !stop)
  1801. if (!strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "select"))
  1802. stop = TRUE;
  1803. else
  1804. i--;
  1805. }
  1806. else if (!strcmp (pHTMLGIMapping[start].XMLname, "dd") ||
  1807. !strcmp (pHTMLGIMapping[start].XMLname, "dt"))
  1808. {
  1809. while (i > 0 && entry != GINumberStack[i] && !stop)
  1810. if (!strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "dl"))
  1811. stop = TRUE;
  1812. else
  1813. i--;
  1814. }
  1815. else if (!strcmp (pHTMLGIMapping[start].XMLname, "tr") ||
  1816. !strcmp (pHTMLGIMapping[start].XMLname, "td") ||
  1817. !strcmp (pHTMLGIMapping[start].XMLname, "th"))
  1818. {
  1819. while (i > 0 && entry != GINumberStack[i] && !stop)
  1820. if (!strcmp (pHTMLGIMapping[GINumberStack[i]].XMLname, "table"))
  1821. stop = TRUE;
  1822. else
  1823. i--;
  1824. }
  1825. }
  1826. if (i >= 0 && entry == GINumberStack[i])
  1827. /* element found in the stack */
  1828. {
  1829. /* This element and its whole subtree are closed */
  1830. StackLevel = i;
  1831. HTMLcontext.lastElement = ElementStack[i];
  1832. HTMLcontext.lastElementClosed = TRUE;
  1833. ret = TRUE;
  1834. }
  1835. else if (!stop)
  1836. /* element not found in the stack */
  1837. if (start >= 0 && HTMLcontext.lastElement != NULL)
  1838. {
  1839. /* implicit close. Check the parent of current element */
  1840. if (InsertSibling ())
  1841. parent = TtaGetParent (HTMLcontext.lastElement);
  1842. else
  1843. parent = HTMLcontext.lastElement;
  1844. if (parent != NULL)
  1845. {
  1846. parentType = TtaGetElementType (parent);
  1847. if (elType.ElTypeNum == parentType.ElTypeNum)
  1848. {
  1849. HTMLcontext.lastElement = parent;
  1850. HTMLcontext.lastElementClosed = TRUE;
  1851. ret = TRUE;
  1852. }
  1853. else if (TtaIsLeaf (TtaGetElementType (HTMLcontext.lastElement)))
  1854. {
  1855. parent = TtaGetParent (parent);
  1856. if (parent != NULL)
  1857. {
  1858. parentType = TtaGetElementType (parent);
  1859. if (elType.ElTypeNum == parentType.ElTypeNum)
  1860. {
  1861. HTMLcontext.lastElement = parent;
  1862. HTMLcontext.lastElementClosed = TRUE;
  1863. ret = TRUE;
  1864. }
  1865. }
  1866. }
  1867. }
  1868. }
  1869. if (ret)
  1870. /* successful close */
  1871. {
  1872. /* remove closed elements from the stack */
  1873. while (i > 0)
  1874. if (ElementStack[i] == HTMLcontext.lastElement)
  1875. {
  1876. StackLevel = i;
  1877. i = 0;
  1878. }
  1879. else
  1880. {
  1881. if (TtaIsAncestor (ElementStack[i], HTMLcontext.lastElement))
  1882. StackLevel = i;
  1883. i--;
  1884. }
  1885. if (StackLevel > 0)
  1886. HTMLcontext.language = LanguageStack[StackLevel - 1];
  1887. /* complete all closed elements */
  1888. if (el != HTMLcontext.lastElement)
  1889. if (!TtaIsAncestor(el, HTMLcontext.lastElement))
  1890. el = NULL;
  1891. spacesDeleted = FALSE;
  1892. while (el != NULL)
  1893. {
  1894. XhtmlElementComplete (&HTMLcontext, el, &error);
  1895. elType = TtaGetElementType (el);
  1896. if (elType.ElTypeNum == HTML_EL_Table_)
  1897. HTMLcontext.withinTable--;
  1898. if (!spacesDeleted)
  1899. /* If the element closed is a block-element, remove */
  1900. /* spaces contained at the end of that element */
  1901. spacesDeleted = RemoveEndingSpaces (el);
  1902. if (el == HTMLcontext.lastElement)
  1903. el = NULL;
  1904. else
  1905. el = TtaGetParent (el);
  1906. }
  1907. }
  1908. }
  1909. return ret;
  1910. }
  1911. /*----------------------------------------------------------------------
  1912. MapAttrValue search in AttrValueMappingTable the entry for
  1913. the attribute thotAttr and its value attrVal. Returns the corresponding
  1914. Thot value.
  1915. ----------------------------------------------------------------------*/
  1916. int MapAttrValue (int thotAttr, char* attrVal)
  1917. {
  1918. int i, value;
  1919. value = -1;
  1920. i = 0;
  1921. while (XhtmlAttrValueMappingTable[i].ThotAttr != thotAttr &&
  1922. XhtmlAttrValueMappingTable[i].ThotAttr != 0)
  1923. i++;
  1924. if (XhtmlAttrValueMappingTable[i].ThotAttr == thotAttr)
  1925. do
  1926. if (attrVal[1] == EOS && (thotAttr == HTML_ATTR_NumberStyle ||
  1927. thotAttr == HTML_ATTR_ItemStyle))
  1928. /* attributes NumberStyle (which is always 1 character long) */
  1929. /* and ItemStyle (only when its length is 1) are */
  1930. /* case sensistive. Compare their exact value */
  1931. if (attrVal[0] == XhtmlAttrValueMappingTable[i].XMLattrValue[0])
  1932. value = XhtmlAttrValueMappingTable[i].ThotAttrValue;
  1933. else
  1934. i++;
  1935. else
  1936. /* for other attributes, uppercase and lowercase are */
  1937. /* equivalent */
  1938. if (!strcasecmp ((char *)XhtmlAttrValueMappingTable[i].XMLattrValue, (char *)attrVal))
  1939. value = XhtmlAttrValueMappingTable[i].ThotAttrValue;
  1940. else
  1941. i++;
  1942. while (value < 0 && XhtmlAttrValueMappingTable[i].ThotAttr == thotAttr);
  1943. return value;
  1944. }
  1945. /*----------------------------------------------------------------------
  1946. StopParsing
  1947. Stops the document parsing when an unrecoverable error is found
  1948. ----------------------------------------------------------------------*/
  1949. static void StopParsing (Document doc)
  1950. {
  1951. NormalTransition = FALSE;
  1952. HTMLrootClosed = TRUE;
  1953. CurrentBufChar = 0;
  1954. }
  1955. /*----------------------------------------------------------------------
  1956. InsertInvalidEl
  1957. create an Invalid_element element or a Unknown element.
  1958. badposition indicate whether the element type is unknown (FALSE) or the
  1959. tag position is incorrect (TRUE).
  1960. ----------------------------------------------------------------------*/
  1961. static void InsertInvalidEl (char* content, ThotBool badposition)
  1962. {
  1963. ElementType elType;
  1964. Element elInv, elText;
  1965. elType.ElSSchema = DocumentSSchema;
  1966. if (badposition)
  1967. elType.ElTypeNum = HTML_EL_Invalid_element;
  1968. else
  1969. elType.ElTypeNum = HTML_EL_Unknown_namespace;
  1970. elInv = TtaNewElement (HTMLcontext.doc, elType);
  1971. TtaSetElementLineNumber (elInv, NumberOfLinesRead);
  1972. InsertElement (&elInv);
  1973. if (elInv)
  1974. {
  1975. HTMLcontext.lastElementClosed = TRUE;
  1976. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  1977. elText = TtaNewElement (HTMLcontext.doc, elType);
  1978. TtaSetElementLineNumber (elText, NumberOfLinesRead);
  1979. TtaInsertFirstChild (&elText, elInv, HTMLcontext.doc);
  1980. TtaSetTextContent (elText, (unsigned char *)content, HTMLcontext.language, HTMLcontext.doc);
  1981. InitBuffer ();
  1982. if (!UnknownTag)
  1983. /* close the end tag */
  1984. TtaAppendTextContent (elText, (unsigned char *)">", HTMLcontext.doc);
  1985. if (badposition)
  1986. TtaSetAccessRight (elInv, ReadOnly, HTMLcontext.doc);
  1987. }
  1988. }
  1989. /*----------------------------------------------------------------------
  1990. EndOfStartTag a ">" has been read. It indicates the end
  1991. of a start tag.
  1992. ----------------------------------------------------------------------*/
  1993. static void EndOfStartTag (char c)
  1994. {
  1995. Element elText;
  1996. ElementType elType;
  1997. AttributeType attrType;
  1998. Attribute attr;
  1999. char *text;
  2000. int length, error;
  2001. if (UnknownTag)
  2002. {
  2003. if (HTMLcontext.lastElement)
  2004. {
  2005. CloseBuffer ();
  2006. elType = TtaGetElementType (HTMLcontext.lastElement);
  2007. if (elType.ElTypeNum == HTML_EL_Invalid_element ||
  2008. elType.ElTypeNum == HTML_EL_Unknown_namespace)
  2009. {
  2010. elText = TtaGetLastChild (HTMLcontext.lastElement);
  2011. if (LgBuffer > 0)
  2012. TtaAppendTextContent (elText, (unsigned char *)inputBuffer,
  2013. HTMLcontext.doc);
  2014. TtaAppendTextContent (elText, (unsigned char *)">",
  2015. HTMLcontext.doc);
  2016. }
  2017. InitBuffer ();
  2018. }
  2019. UnknownTag = FALSE;
  2020. }
  2021. if (HTMLcontext.lastElement && lastElemEntry != -1)
  2022. {
  2023. if (!strcmp (pHTMLGIMapping[lastElemEntry].XMLname, "pre") ||
  2024. !strcmp (pHTMLGIMapping[lastElemEntry].XMLname, "style") ||
  2025. !strcmp (pHTMLGIMapping[lastElemEntry].XMLname, "script"))
  2026. /* a <PRE>, <STYLE> or <SCRIPT> tag has been read */
  2027. AfterTagPRE = TRUE;
  2028. else if (!strcmp (pHTMLGIMapping[lastElemEntry].XMLname, "table"))
  2029. /* <TABLE> has been read */
  2030. HTMLcontext.withinTable++;
  2031. else if (pHTMLGIMapping[lastElemEntry].XMLcontents == 'E')
  2032. /* this is an empty element. Do not expect an end tag */
  2033. {
  2034. CloseElement (lastElemEntry, -1, TRUE);
  2035. XhtmlElementComplete (&HTMLcontext, HTMLcontext.lastElement, &error);
  2036. }
  2037. /* if it's an AREA element, computes its position and size */
  2038. ParseAreaCoords (HTMLcontext.lastElement, HTMLcontext.doc);
  2039. /* if it's a STYLE element in CSS notation, activate the CSS */
  2040. /* parser for parsing the element content */
  2041. elType = TtaGetElementType (HTMLcontext.lastElement);
  2042. if (elType.ElTypeNum == HTML_EL_STYLE_)
  2043. {
  2044. /* Search the Notation attribute */
  2045. attrType.AttrSSchema = elType.ElSSchema;
  2046. attrType.AttrTypeNum = HTML_ATTR_Notation;
  2047. attr = TtaGetAttribute (HTMLcontext.lastElement, attrType);
  2048. if (attr == NULL)
  2049. /* No Notation attribute. Assume CSS by default */
  2050. HTMLcontext.parsingCSS = TRUE;
  2051. else
  2052. /* the STYLE element has a Notation attribute */
  2053. /* get its value */
  2054. {
  2055. length = TtaGetTextAttributeLength (attr);
  2056. text = (char*)TtaGetMemory (length + 1);
  2057. TtaGiveTextAttributeValue (attr, text, &length);
  2058. if (!strcasecmp ((char *)text, "text/css"))
  2059. HTMLcontext.parsingCSS = TRUE;
  2060. TtaFreeMemory (text);
  2061. }
  2062. }
  2063. else if (elType.ElTypeNum == HTML_EL_Text_Area)
  2064. /* we have to read the content as a simple text unit */
  2065. HTMLcontext.parsingTextArea = TRUE;
  2066. else if (elType.ElTypeNum == HTML_EL_SCRIPT_)
  2067. /* we have to read the content as a simple text unit */
  2068. HTMLcontext.parsingScript = TRUE;
  2069. }
  2070. if (c == '<')
  2071. {
  2072. HTMLParseError (HTMLcontext.doc, "Syntax error", 0);
  2073. StartOfTag (c);
  2074. }
  2075. }
  2076. /*----------------------------------------------------------------------
  2077. ContextOK returns TRUE if the element at position entry
  2078. in the mapping table is allowed to occur in the
  2079. current structural context.
  2080. ----------------------------------------------------------------------*/
  2081. static ThotBool ContextOK (int entry)
  2082. {
  2083. ThotBool ok;
  2084. int saveLastElemEntry;
  2085. if (StackLevel == 0 || GINumberStack[StackLevel - 1] < 0)
  2086. return TRUE;
  2087. else
  2088. {
  2089. ok = TRUE;
  2090. if (!strcmp (pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLname, "tr") &&
  2091. strcmp (pHTMLGIMapping[entry].XMLname, "th") &&
  2092. strcmp (pHTMLGIMapping[entry].XMLname, "td"))
  2093. /* only TH and TD elements are allowed as children of a TR element */
  2094. ok = FALSE;
  2095. if (ok &&
  2096. !strcmp (pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLname, "table") &&
  2097. strcmp (pHTMLGIMapping[entry].XMLname, "caption") &&
  2098. strcmp (pHTMLGIMapping[entry].XMLname, "thead") &&
  2099. strcmp (pHTMLGIMapping[entry].XMLname, "tfoot") &&
  2100. strcmp (pHTMLGIMapping[entry].XMLname, "tbody") &&
  2101. strcmp (pHTMLGIMapping[entry].XMLname, "colgroup") &&
  2102. strcmp (pHTMLGIMapping[entry].XMLname, "col") &&
  2103. strcmp (pHTMLGIMapping[entry].XMLname, "tr"))
  2104. {
  2105. /* only CAPTION, THEAD, TFOOT, TBODY, COLGROUP, COL and TR are */
  2106. /* allowed as children of a TABLE element */
  2107. if (!strcmp (pHTMLGIMapping[entry].XMLname, "td") ||
  2108. !strcmp (pHTMLGIMapping[entry].XMLname, "th"))
  2109. /* Table cell within a table, without a tr. Assume tr */
  2110. {
  2111. /* save the last last identifier read from the input file */
  2112. saveLastElemEntry = lastElemEntry;
  2113. /* simulate a <TR> tag */
  2114. ProcessStartGI ("tr");
  2115. /* restore the last tag that has actually been read */
  2116. lastElemEntry = saveLastElemEntry;
  2117. }
  2118. else
  2119. ok = FALSE;
  2120. }
  2121. if (ok &&
  2122. (!strcmp (pHTMLGIMapping[entry].XMLname, "caption") ||
  2123. !strcmp (pHTMLGIMapping[entry].XMLname, "thead") ||
  2124. !strcmp (pHTMLGIMapping[entry].XMLname, "tfoot") ||
  2125. !strcmp (pHTMLGIMapping[entry].XMLname, "tbody") ||
  2126. !strcmp (pHTMLGIMapping[entry].XMLname, "colgroup")) &&
  2127. strcmp (pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLname, "table"))
  2128. /* CAPTION, THEAD, TFOOT, TBODY, COLGROUP are allowed only as
  2129. children of a TABLE element */
  2130. ok = FALSE;
  2131. if (ok &&
  2132. (!strcmp (pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLname, "thead") ||
  2133. !strcmp (pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLname, "tfoot") ||
  2134. !strcmp (pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLname, "tbody")) &&
  2135. strcmp (pHTMLGIMapping[entry].XMLname, "tr"))
  2136. /* only TR is allowed as a child of a THEAD, TFOOT or TBODY element */
  2137. {
  2138. if (!strcmp (pHTMLGIMapping[entry].XMLname, "td") ||
  2139. !strcmp (pHTMLGIMapping[entry].XMLname, "th"))
  2140. /* Table cell within a thead, tfoot or tbody without a tr. */
  2141. /* Assume tr */
  2142. {
  2143. /* save the last last identifier read from the input file */
  2144. saveLastElemEntry = lastElemEntry;
  2145. /* simulate a <tr> tag */
  2146. ProcessStartGI ("tr");
  2147. /* restore the last tag that has actually been read */
  2148. lastElemEntry = saveLastElemEntry;
  2149. }
  2150. else
  2151. ok = FALSE;
  2152. }
  2153. if (ok)
  2154. /* refuse HEAD within HEAD */
  2155. if (strcmp (pHTMLGIMapping[entry].XMLname, "head") == 0)
  2156. if (Within (HTML_EL_HEAD, DocumentSSchema))
  2157. ok = FALSE;
  2158. if (ok)
  2159. /* refuse STYLE within STYLE */
  2160. if (strcmp (pHTMLGIMapping[entry].XMLname, "style") == 0)
  2161. if (Within (HTML_EL_STYLE_, DocumentSSchema))
  2162. ok = FALSE;
  2163. return ok;
  2164. }
  2165. }
  2166. /*----------------------------------------------------------------------
  2167. SpecialImplicitEnd
  2168. ----------------------------------------------------------------------*/
  2169. static void SpecialImplicitEnd (int entry)
  2170. {
  2171. ElementType elType;
  2172. /* if current element is DD, Hn closes that DD only when there is */
  2173. /* no enclosing DL */
  2174. if (pHTMLGIMapping[entry].XMLname[0] == 'H' &&
  2175. pHTMLGIMapping[entry].XMLname[1] >= '1' &&
  2176. pHTMLGIMapping[entry].XMLname[1] <= '6' &&
  2177. pHTMLGIMapping[entry].XMLname[2] == EOS)
  2178. /* the new element is a Hn */
  2179. if (StackLevel > 1)
  2180. if (ElementStack[StackLevel - 1] != NULL)
  2181. {
  2182. elType = TtaGetElementType (ElementStack[StackLevel - 1]);
  2183. if (elType.ElTypeNum == HTML_EL_Definition)
  2184. /* the current element is a DD */
  2185. {
  2186. elType = TtaGetElementType (ElementStack[StackLevel - 2]);
  2187. if (elType.ElTypeNum != HTML_EL_Definition_List)
  2188. /* DD in not within a DL. Close the DD element */
  2189. CloseElement (GINumberStack[StackLevel - 1], entry, FALSE);
  2190. }
  2191. }
  2192. }
  2193. /*----------------------------------------------------------------------
  2194. ProcessStartGI An HTML GI has been read in a start tag.
  2195. Create the corresponding Thot thing (element, attribute,
  2196. or character), according to the mapping table.
  2197. ----------------------------------------------------------------------*/
  2198. static void ProcessStartGI (const char* GIname)
  2199. {
  2200. ElementType elType;
  2201. Element el;
  2202. int entry, i;
  2203. char msgBuffer[MaxMsgLength];
  2204. PtrClosedElement pClose;
  2205. ThotBool sameLevel, removed, error;
  2206. SSchema schema;
  2207. /* ignore tag <P> within PRE */
  2208. if (Within (HTML_EL_Preformatted, DocumentSSchema))
  2209. if (strcasecmp ((char *)GIname, "p") == 0)
  2210. return;
  2211. /* search the HTML element name in the mapping table */
  2212. schema = DocumentSSchema;
  2213. entry = MapGI ((char *)GIname, &schema, HTMLcontext.doc);
  2214. lastElemEntry = entry;
  2215. if (entry < 0)
  2216. /* not found in the HTML DTD */
  2217. {
  2218. /* check if it's the math or svg tag with a namespace prefix */
  2219. /* So, look for a colon in the element name */
  2220. for (i = 0; GIname[i] != ':' && GIname[i] != EOS; i++);
  2221. if (GIname[i] == ':' &&
  2222. (strcasecmp ((char *)&GIname[i+1], "math") == 0 ||
  2223. strcasecmp ((char *)&GIname[i+1], "xmlgraphics") == 0 ||
  2224. strcasecmp ((char *)&GIname[i+1], "svg") == 0))
  2225. /* it's a math or svg tag with a namespace prefix. OK */
  2226. {
  2227. entry = MapGI ((char *)&GIname[i+1], &schema, HTMLcontext.doc);
  2228. lastElemEntry = entry;
  2229. }
  2230. else
  2231. /* unknown tag */
  2232. {
  2233. UnknownTag = TRUE;
  2234. if (DocumentMeta[HTMLcontext.doc] &&
  2235. DocumentMeta[HTMLcontext.doc]->xmlformat)
  2236. {
  2237. snprintf (msgBuffer, MaxMsgLength,
  2238. "Invalid tag <%s> (removed when saving)", GIname);
  2239. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2240. removed = TRUE;
  2241. }
  2242. else
  2243. {
  2244. snprintf (msgBuffer, MaxMsgLength, "Warning - unknown tag <%s>",
  2245. GIname);
  2246. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2247. removed = FALSE;
  2248. }
  2249. /* create an Invalid_element */
  2250. snprintf (msgBuffer, MaxMsgLength, "<%s", GIname);
  2251. InsertInvalidEl (msgBuffer, removed);
  2252. }
  2253. }
  2254. if (entry >= 0)
  2255. {
  2256. if (TtaGetDocumentProfile(HTMLcontext.doc) != L_Other &&
  2257. !(pHTMLGIMapping[entry].Level &
  2258. TtaGetDocumentProfile(HTMLcontext.doc)))
  2259. {
  2260. /* Invalid element for the document profile */
  2261. /* don't process that element */
  2262. snprintf (msgBuffer, MaxMsgLength,
  2263. "Invalid element <%s> for the document profile", GIname);
  2264. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2265. XMLErrorsFoundInProfile = TRUE;
  2266. UnknownTag = TRUE;
  2267. }
  2268. else
  2269. {
  2270. if (HTMLcontext.withinTable == 0 &&
  2271. (!strcmp (pHTMLGIMapping[entry].XMLname, "td") ||
  2272. !strcmp (pHTMLGIMapping[entry].XMLname, "th")))
  2273. {
  2274. sprintf (msgBuffer, "Tags <table>, <tbody> and <tr> added");
  2275. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2276. /* generate mandatory parent elements */
  2277. ProcessStartGI ("table");
  2278. HTMLcontext.withinTable = 1;
  2279. ProcessStartGI ("tr");
  2280. }
  2281. else if (HTMLcontext.withinTable == 0 &&
  2282. !strcmp (pHTMLGIMapping[entry].XMLname, "tr"))
  2283. {
  2284. /* generate mandatory parent elements */
  2285. sprintf (msgBuffer, "Tags <table> and <tbody> added");
  2286. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2287. ProcessStartGI ("table");
  2288. }
  2289. /* does this start tag also imply the end tag of some current elements?*/
  2290. pClose = FirstClosedElem[entry];
  2291. while (pClose != NULL)
  2292. {
  2293. CloseElement (pClose->tagNum, entry, TRUE);
  2294. pClose = pClose->nextClosedElem;
  2295. }
  2296. /* process some special cases... */
  2297. SpecialImplicitEnd (entry);
  2298. error = !ContextOK (entry);
  2299. if (error)
  2300. /* element not allowed in the current structural context */
  2301. {
  2302. /* send an error message */
  2303. snprintf (msgBuffer, MaxMsgLength,
  2304. "Tag <%s> is not allowed here (removed when saving)",
  2305. GIname);
  2306. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2307. /* if it's a <script> tag, process it normally to avoid its
  2308. content to be considered as plain text */
  2309. if (!strcmp (pHTMLGIMapping[entry].XMLname, "script"))
  2310. error = FALSE;
  2311. }
  2312. if (error)
  2313. {
  2314. UnknownTag = TRUE;
  2315. /* create an Invalid_element */
  2316. snprintf (msgBuffer, MaxMsgLength, "<%s", GIname);
  2317. InsertInvalidEl (msgBuffer, TRUE);
  2318. }
  2319. else
  2320. {
  2321. el = NULL;
  2322. sameLevel = TRUE;
  2323. if (pHTMLGIMapping[entry].ThotType > 0)
  2324. {
  2325. /* create a Thot element */
  2326. elType.ElSSchema = DocumentSSchema;
  2327. elType.ElTypeNum = pHTMLGIMapping[entry].ThotType;
  2328. if (pHTMLGIMapping[entry].XMLcontents == 'E')
  2329. /* empty HTML element. Create all children specified */
  2330. /* in the Thot structure schema */
  2331. el = TtaNewTree (HTMLcontext.doc, elType, "");
  2332. else
  2333. /* the HTML element may have children. Create only */
  2334. /* the corresponding Thot element, without any child */
  2335. el = TtaNewElement (HTMLcontext.doc, elType);
  2336. TtaSetElementLineNumber (el, NumberOfLinesRead);
  2337. sameLevel = InsertElement (&el);
  2338. if (el != NULL)
  2339. {
  2340. if (pHTMLGIMapping[entry].XMLcontents == 'E')
  2341. HTMLcontext.lastElementClosed = TRUE;
  2342. if (elType.ElTypeNum == HTML_EL_TEXT_UNIT)
  2343. /* an empty Text element has been created. The */
  2344. /* following character data must go to that elem. */
  2345. HTMLcontext.mergeText = TRUE;
  2346. }
  2347. }
  2348. if (pHTMLGIMapping[entry].XMLcontents != 'E')
  2349. {
  2350. if (StackLevel >= MaxStack - 1)
  2351. HTMLParseError (HTMLcontext.doc, "Too many nested elements", 0);
  2352. else
  2353. {
  2354. ElementStack[StackLevel] = el;
  2355. if (sameLevel)
  2356. ThotLevel[StackLevel] = ThotLevel[StackLevel - 1];
  2357. else
  2358. ThotLevel[StackLevel] = ThotLevel[StackLevel - 1] + 1;
  2359. LanguageStack[StackLevel] = HTMLcontext.language;
  2360. GINumberStack[StackLevel++] = entry;
  2361. }
  2362. }
  2363. }
  2364. }
  2365. }
  2366. }
  2367. /*----------------------------------------------------------------------
  2368. EndOfStartGI An HTML GI has been read in a start tag.
  2369. ----------------------------------------------------------------------*/
  2370. static void EndOfStartGI (char c)
  2371. {
  2372. char schemaName[20];
  2373. char theGI[MaxMsgLength];
  2374. char *tagName;
  2375. int i;
  2376. if (HTMLcontext.parsingTextArea || HTMLcontext.parsingScript)
  2377. /* We are parsing the contents of a TEXTAREA or SCRIPT element. If a start
  2378. tag appears, consider it as plain text */
  2379. {
  2380. /* next state is state 0, not the state computed by the automaton */
  2381. NormalTransition = FALSE;
  2382. currentState = 0;
  2383. /* put a '<' and the tagname (GI) in the input buffer */
  2384. for (i = LgBuffer; i > 0; i--)
  2385. inputBuffer[i] = inputBuffer[i - 1];
  2386. LgBuffer++;
  2387. inputBuffer[0] = '<';
  2388. inputBuffer[LgBuffer] = EOS;
  2389. /* copy the input buffer in the document */
  2390. TextToDocument ();
  2391. }
  2392. else
  2393. {
  2394. /* if the last character in the GI is a '/', ignore it. This is to
  2395. accept the XML syntax for empty elements, for instance <br/> */
  2396. if (LgBuffer > 0 && inputBuffer[LgBuffer-1] == '/')
  2397. LgBuffer--;
  2398. CloseBuffer ();
  2399. strncpy ((char *)theGI, (char *)inputBuffer, MaxMsgLength - 1);
  2400. theGI[MaxMsgLength - 1] = EOS;
  2401. InitBuffer ();
  2402. if (HTMLcontext.lastElementClosed &&
  2403. HTMLcontext.lastElement == rootElement)
  2404. /* an element after the tag </html>, ignore it */
  2405. {
  2406. HTMLParseError (HTMLcontext.doc, "Element after tag </html>. Ignored", 0);
  2407. return;
  2408. }
  2409. /* if it's a "math" or "svg" tag, it may have a namespace name */
  2410. tagName = theGI;
  2411. for (i = 0; theGI[i] != ':' && theGI[i] != EOS; i++);
  2412. if (theGI[i] == ':' &&
  2413. (strcasecmp ((char *)&theGI[i+1], "math") == 0 ||
  2414. strcasecmp ((char *)&theGI[i+1], "svg") == 0))
  2415. /* it's a math or svg tag with a namespace prefix. ignore the prefix */
  2416. tagName = &theGI[i+1];
  2417. if (!strcmp (tagName, "math") || !strcmp (tagName, "svg"))
  2418. /* a <math> or <svg> tag has been read */
  2419. {
  2420. /* get back to the beginning of the tag in the input buffer */
  2421. /* "NotToReadFile" boolean means that we get back in the */
  2422. /* previous input buffer */
  2423. /* That case happens when the "<" and ">" characters for that */
  2424. /* tag have not been read in the same input buffer */
  2425. if (StartOfTagIndx <= 0 ||
  2426. (StartOfTagIndx > CurrentBufChar && CurrentBufChar != 0))
  2427. {
  2428. NumberOfCharRead = PreviousNumberOfCharRead;
  2429. NumberOfLinesRead = PreviousNumberOfLinesRead;
  2430. NotToReadFile = TRUE;
  2431. if (StartOfTagIndx < 0)
  2432. CurrentBufChar = LastCharInPreviousRead;
  2433. else
  2434. CurrentBufChar = StartOfTagIndx;
  2435. }
  2436. else
  2437. CurrentBufChar = StartOfTagIndx;
  2438. if (!strcmp ((char *)tagName, (char *)"math"))
  2439. strcpy ((char *)schemaName, (char *)"MathML");
  2440. else
  2441. strcpy ((char *)schemaName, (char *)"SVG");
  2442. /* Parse the corresponding element with the XML parser */
  2443. if (!ParseIncludedXml ((FILE *)stream, &WorkBuffer, INPUT_FILE_BUFFER_SIZE,
  2444. &EndOfHtmlFile, &NotToReadFile,
  2445. PreviousRead, &LastCharInWorkBuffer,
  2446. InputText, &CurrentBufChar,
  2447. &NumberOfLinesRead, &NumberOfCharRead,
  2448. schemaName, HTMLcontext.doc,
  2449. &HTMLcontext.lastElement,
  2450. &HTMLcontext.lastElementClosed,
  2451. HTMLcontext.language))
  2452. StopParsing (HTMLcontext.doc); /* the XML parser raised an error */
  2453. /* the whole element has been read by the XML parser */
  2454. /* reset the automaton state */
  2455. NormalTransition = FALSE;
  2456. currentState = 0;
  2457. CharProcessed = TRUE;
  2458. }
  2459. else
  2460. ProcessStartGI (tagName);
  2461. }
  2462. }
  2463. /*----------------------------------------------------------------------
  2464. EndOfStartGIandTag a ">" has been read. It indicates the
  2465. end of a GI and the end of a start tag.
  2466. ----------------------------------------------------------------------*/
  2467. static void EndOfStartGIandTag (char c)
  2468. {
  2469. EndOfStartGI (c);
  2470. EndOfStartTag (c);
  2471. if (c == '<')
  2472. {
  2473. HTMLParseError (HTMLcontext.doc, "Syntax error", 0);
  2474. StartOfTag (c);
  2475. }
  2476. }
  2477. /*----------------------------------------------------------------------
  2478. StartCData a new CDATA element (<![CDATA[)
  2479. ----------------------------------------------------------------------*/
  2480. static void StartCData (char c)
  2481. {
  2482. ElementType elType;
  2483. Element el, child;
  2484. CloseBuffer ();
  2485. if (!strcasecmp ((char *)inputBuffer, "cdata"))
  2486. {
  2487. elType.ElSSchema = DocumentSSchema;
  2488. elType.ElTypeNum = HTML_EL_CDATA;
  2489. el = TtaNewElement (HTMLcontext.doc, elType);
  2490. TtaSetElementLineNumber (el, NumberOfLinesRead);
  2491. InsertElement (&el);
  2492. elType.ElTypeNum = HTML_EL_CDATA_line;
  2493. child = TtaNewTree (HTMLcontext.doc, elType, "");
  2494. TtaSetElementLineNumber (child, NumberOfLinesRead);
  2495. TtaInsertFirstChild (&child, el, HTMLcontext.doc);
  2496. HTMLcontext.lastElement = TtaGetFirstChild (child);
  2497. /* clear the input buffer */
  2498. InitBuffer ();
  2499. }
  2500. }
  2501. /*----------------------------------------------------------------------
  2502. CloseCDataLine closes a CDATA line.
  2503. ----------------------------------------------------------------------*/
  2504. static void CloseCDataLine (char c)
  2505. {
  2506. CloseBuffer ();
  2507. if (LgBuffer > 0 && inputBuffer[LgBuffer-1] == EOL)
  2508. {
  2509. LgBuffer--;
  2510. inputBuffer[LgBuffer] = EOS;
  2511. }
  2512. if (LgBuffer > 0 && inputBuffer[LgBuffer-1] == ']')
  2513. {
  2514. LgBuffer--;
  2515. inputBuffer[LgBuffer] = EOS;
  2516. }
  2517. if (LgBuffer > 0 && inputBuffer[LgBuffer-1] == ']')
  2518. {
  2519. LgBuffer--;
  2520. inputBuffer[LgBuffer] = EOS;
  2521. }
  2522. /* copy the input buffer into the document */
  2523. if (LgBuffer)
  2524. TtaAppendTextContent (HTMLcontext.lastElement, (unsigned char *)inputBuffer,
  2525. HTMLcontext.doc);
  2526. /* clear the input buffer */
  2527. InitBuffer ();
  2528. }
  2529. /*----------------------------------------------------------------------
  2530. EndOfCDataLine closes a CDATA line.
  2531. ----------------------------------------------------------------------*/
  2532. static void EndOfCDataLine (char c)
  2533. {
  2534. ElementType elType;
  2535. Element el, child;
  2536. CloseCDataLine (c);
  2537. /* start a new CDATA line */
  2538. elType.ElSSchema = DocumentSSchema;
  2539. elType.ElTypeNum = HTML_EL_CDATA_line;
  2540. child = TtaNewTree (HTMLcontext.doc, elType, "");
  2541. TtaSetElementLineNumber (child, NumberOfLinesRead);
  2542. elType = TtaGetElementType (HTMLcontext.lastElement);
  2543. if (elType.ElTypeNum == HTML_EL_TEXT_UNIT)
  2544. el = TtaGetParent (HTMLcontext.lastElement);
  2545. else
  2546. el = HTMLcontext.lastElement;
  2547. TtaInsertSibling (child, el, FALSE, HTMLcontext.doc);
  2548. HTMLcontext.lastElement = TtaGetFirstChild (child);
  2549. }
  2550. /*----------------------------------------------------------------------
  2551. EndOfCdata closes a CDATA.
  2552. ----------------------------------------------------------------------*/
  2553. static void EndOfCData (char c)
  2554. {
  2555. ElementType elType;
  2556. CloseCDataLine (c);
  2557. HTMLcontext.lastElementClosed = TRUE;
  2558. elType.ElSSchema = DocumentSSchema;
  2559. elType.ElTypeNum = HTML_EL_CDATA;
  2560. HTMLcontext.lastElement = TtaGetTypedAncestor (HTMLcontext.lastElement, elType);
  2561. }
  2562. /*----------------------------------------------------------------------
  2563. EndOfEndTag An end tag has been read in the HTML file.
  2564. Terminate all corresponding Thot elements.
  2565. ----------------------------------------------------------------------*/
  2566. static void EndOfEndTag (char c)
  2567. {
  2568. SSchema schema;
  2569. char msgBuffer[MaxMsgLength];
  2570. int entry;
  2571. int i, profile;
  2572. ThotBool ok, removed;
  2573. CloseBuffer ();
  2574. if ((HTMLcontext.parsingTextArea &&
  2575. strcasecmp ((char *)inputBuffer, "textarea")) ||
  2576. (HTMLcontext.parsingScript &&
  2577. strcasecmp ((char *)inputBuffer, "script")))
  2578. /* We are parsing the contents of a textarea or script element. The end
  2579. tag is not the one closing the current textarea or script, consider it
  2580. as plain text */
  2581. {
  2582. /* next state is state 0, not the state computed by the automaton */
  2583. NormalTransition = FALSE;
  2584. currentState = 0;
  2585. /* put "</" and the tag name in the input buffer */
  2586. for (i = LgBuffer; i > 0; i--)
  2587. inputBuffer[i + 1] = inputBuffer[i - 1];
  2588. LgBuffer += 2;
  2589. inputBuffer[0] = '<';
  2590. inputBuffer[1] = '/';
  2591. inputBuffer[LgBuffer] = EOS;
  2592. /* copy the input buffer into the document */
  2593. TextToDocument ();
  2594. }
  2595. else
  2596. {
  2597. /* is it the end of the current HTML fragment ? */
  2598. ok = FALSE;
  2599. if (HTMLrootClosingTag != EOS)
  2600. {
  2601. /* look for a colon in the element name (namespaces) and ignore the
  2602. prefix if there is one */
  2603. for (i = 0; i < LgBuffer && inputBuffer[i] != ':'; i++);
  2604. if (inputBuffer[i] == ':')
  2605. i++;
  2606. else
  2607. i = 0;
  2608. if (strcasecmp ((char *)&inputBuffer[i], (char *)HTMLrootClosingTag) == 0)
  2609. {
  2610. HTMLrootClosed = TRUE;
  2611. ok = TRUE;
  2612. }
  2613. }
  2614. //if (!strcasecmp ((char *)inputBuffer, "font"))
  2615. // printf ("font element\n");
  2616. profile = TtaGetDocumentProfile(HTMLcontext.doc);
  2617. if (!ok)
  2618. {
  2619. /* search the HTML tag in the mapping table */
  2620. schema = DocumentSSchema;
  2621. entry = MapGI ((char *)inputBuffer, &schema, HTMLcontext.doc);
  2622. if (entry < 0)
  2623. {
  2624. if (strlen ((char *)inputBuffer) > MaxMsgLength - 20)
  2625. inputBuffer[MaxMsgLength - 20] = EOS;
  2626. if (DocumentMeta[HTMLcontext.doc] &&
  2627. DocumentMeta[HTMLcontext.doc]->xmlformat)
  2628. {
  2629. snprintf (msgBuffer, MaxMsgLength,
  2630. "Invalid tag <%s> (removed when saving)", inputBuffer);
  2631. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2632. removed = TRUE;
  2633. }
  2634. else
  2635. {
  2636. snprintf (msgBuffer, MaxMsgLength,
  2637. "Warning - unknown tag </%s>", inputBuffer);
  2638. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2639. removed = FALSE;
  2640. }
  2641. /* create an Invalid_element */
  2642. snprintf (msgBuffer, MaxMsgLength, "</%s", inputBuffer);
  2643. InsertInvalidEl (msgBuffer, removed);
  2644. }
  2645. else if (entry >= 0 &&
  2646. profile != L_Other &&
  2647. !(pHTMLGIMapping[entry].Level & profile))
  2648. {
  2649. /* Invalid element for the document profile */
  2650. if (strlen ((char *)inputBuffer) > MaxMsgLength - 20)
  2651. inputBuffer[MaxMsgLength - 20] = EOS;
  2652. snprintf (msgBuffer, MaxMsgLength,
  2653. "Invalid end element <%s> for the document profile",
  2654. inputBuffer);
  2655. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2656. XMLErrorsFoundInProfile = TRUE;
  2657. }
  2658. else if (!CloseElement (entry, -1, FALSE))
  2659. /* the end tag does not close any current element */
  2660. {
  2661. if (DocumentMeta[HTMLcontext.doc] &&
  2662. DocumentMeta[HTMLcontext.doc]->xmlformat)
  2663. {
  2664. snprintf (msgBuffer, MaxMsgLength,
  2665. "Invalid end tag <%s>", inputBuffer);
  2666. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2667. }
  2668. else
  2669. {
  2670. /* try to recover */
  2671. if ((inputBuffer[0] == 'H' || inputBuffer[0] == 'h') &&
  2672. inputBuffer[1] >= '1' && inputBuffer[1] <= '6' &&
  2673. inputBuffer[2] == EOS)
  2674. /* the end tag is </Hn>. Consider all Hn as equivalent. */
  2675. /* </H3> is considered as an end tag for <H2>, for instance */
  2676. {
  2677. strcpy ((char *)msgBuffer, (char *)inputBuffer);
  2678. msgBuffer[1] = '1';
  2679. i = 1;
  2680. do
  2681. {
  2682. schema = DocumentSSchema;
  2683. entry = MapGI ((char *)msgBuffer, &schema, HTMLcontext.doc);
  2684. ok = CloseElement (entry, -1, FALSE);
  2685. msgBuffer[1]++;
  2686. i++;
  2687. }
  2688. while (i <= 6 && !ok);
  2689. }
  2690. if (!ok &&
  2691. (!strcasecmp ((char *)inputBuffer, "ol") ||
  2692. !strcasecmp ((char *)inputBuffer, "ul") ||
  2693. !strcasecmp ((char *)inputBuffer, "menu") ||
  2694. !strcasecmp ((char *)inputBuffer, "dir")))
  2695. /* the end tag is supposed to close a list */
  2696. /* try to close another type of list */
  2697. {
  2698. ok = TRUE;
  2699. schema = DocumentSSchema;
  2700. if (!CloseElement (MapGI ((char *)"ol", &schema, HTMLcontext.doc), -1, FALSE) &&
  2701. !CloseElement (MapGI ((char *)"ul", &schema, HTMLcontext.doc), -1, FALSE) &&
  2702. !CloseElement (MapGI ((char *)"menu", &schema, HTMLcontext.doc), -1, FALSE) &&
  2703. !CloseElement (MapGI ((char *)"dir", &schema, HTMLcontext.doc), -1, FALSE))
  2704. ok = FALSE;
  2705. }
  2706. if (!ok)
  2707. /* unrecoverable error. Create an Invalid_element */
  2708. {
  2709. snprintf (msgBuffer, MaxMsgLength, "</%s", inputBuffer);
  2710. InsertInvalidEl (msgBuffer, TRUE);
  2711. /* print an error message... */
  2712. snprintf (msgBuffer, MaxMsgLength,
  2713. "Invalid end tag </%s> (removed when saving)",
  2714. inputBuffer);
  2715. }
  2716. else
  2717. /* print an error message... */
  2718. snprintf (msgBuffer, MaxMsgLength,
  2719. "Warning - unexpected end tag </%s>",
  2720. inputBuffer);
  2721. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2722. }
  2723. }
  2724. }
  2725. InitBuffer ();
  2726. }
  2727. if (c == '<')
  2728. {
  2729. HTMLParseError (HTMLcontext.doc, "Syntax error", 0);
  2730. StartOfTag (c);
  2731. }
  2732. }
  2733. /*----------------------------------------------------------------------
  2734. EndOfAttrName A HTML attribute has been read. Create the
  2735. corresponding Thot attribute.
  2736. ----------------------------------------------------------------------*/
  2737. static void EndOfAttrName (char c)
  2738. {
  2739. AttributeMapping* tableEntry;
  2740. AttributeType attrType;
  2741. Element elText;
  2742. ElementType elType;
  2743. Attribute attr;
  2744. SSchema schema;
  2745. char translation;
  2746. char msgBuffer[MaxMsgLength];
  2747. ThotBool highEnoughLevel;
  2748. CloseBuffer ();
  2749. if (UnknownTag && HTMLcontext.lastElement)
  2750. {
  2751. elType = TtaGetElementType (HTMLcontext.lastElement);
  2752. if (elType.ElTypeNum == HTML_EL_Invalid_element ||
  2753. elType.ElTypeNum == HTML_EL_Unknown_namespace)
  2754. {
  2755. elText = TtaGetLastChild (HTMLcontext.lastElement);
  2756. TtaAppendTextContent (elText, (unsigned char *)" ", HTMLcontext.doc);
  2757. TtaAppendTextContent (elText, (unsigned char *)inputBuffer,
  2758. HTMLcontext.doc);
  2759. }
  2760. InitBuffer ();
  2761. lastAttrEntry = NULL;
  2762. return;
  2763. }
  2764. /* if a single '/' or '?' has been read instead of an attribute name, ignore
  2765. that character. This is to accept the XML syntax for empty elements or
  2766. processing instructions, such as <img src="SomeUrl" /> or
  2767. <?xml version="1.0"?> */
  2768. if (LgBuffer == 1 &&
  2769. (inputBuffer[0] == '/' || inputBuffer[0] == '?'))
  2770. {
  2771. InitBuffer ();
  2772. return;
  2773. }
  2774. highEnoughLevel = TRUE;
  2775. /* inputBuffer contains the attribute name */
  2776. /* get the corresponding Thot attribute */
  2777. if (UnknownTag)
  2778. /* ignore attributes of unknown tags */
  2779. tableEntry = NULL;
  2780. else
  2781. tableEntry = MapAttr ((char *)inputBuffer, &schema,
  2782. lastElemEntry, &highEnoughLevel, HTMLcontext.doc);
  2783. if (tableEntry)
  2784. /* this is a known attribute. Can it be associated with the current
  2785. element ? */
  2786. {
  2787. /* reject attribute height on a table */
  2788. if (tableEntry->ThotAttribute == HTML_ATTR_Height_)
  2789. {
  2790. elType = TtaGetElementType (HTMLcontext.lastElement);
  2791. if (elType.ElTypeNum == HTML_EL_Table_)
  2792. tableEntry = NULL;
  2793. }
  2794. else if (tableEntry->ThotAttribute == HTML_ATTR_xmlid)
  2795. {
  2796. snprintf (msgBuffer, MaxMsgLength,
  2797. "Invalid attribute \"%s\"(removed when saving)",
  2798. inputBuffer);
  2799. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2800. /* attach an Invalid_attribute to the current element */
  2801. tableEntry = &pHTMLAttributeMapping[0];
  2802. schema = DocumentSSchema;
  2803. UnknownAttr = TRUE;
  2804. }
  2805. }
  2806. if (!tableEntry)
  2807. {
  2808. if (highEnoughLevel)
  2809. {
  2810. /* this attribute is not in the HTML mapping table */
  2811. if (strcasecmp ((char *)inputBuffer, "xmlns") == 0 ||
  2812. strncasecmp ((char *)inputBuffer, "xmlns:", 6) == 0)
  2813. /* this is a namespace declaration */
  2814. {
  2815. lastAttrEntry = NULL;
  2816. /**** register this namespace ****/;
  2817. }
  2818. //else if (strcasecmp ((char *)inputBuffer, "xml:lang") == 0)
  2819. /* attribute xml:lang is not considered as invalid, but it is
  2820. ignored */
  2821. // lastAttrEntry = NULL;
  2822. else
  2823. {
  2824. snprintf (msgBuffer, MaxMsgLength,
  2825. "Invalid attribute \"%s\"(removed when saving)",
  2826. inputBuffer);
  2827. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2828. /* attach an Invalid_attribute to the current element */
  2829. tableEntry = &pHTMLAttributeMapping[0];
  2830. schema = DocumentSSchema;
  2831. UnknownAttr = TRUE;
  2832. }
  2833. }
  2834. else
  2835. {
  2836. /* attribute invalid for the document profile */
  2837. snprintf (msgBuffer, MaxMsgLength,
  2838. "Invalid attribute \"%s\" for the document profile",
  2839. inputBuffer);
  2840. HTMLParseError (HTMLcontext.doc, msgBuffer, 0);
  2841. XMLErrorsFoundInProfile = TRUE;
  2842. UnknownAttr = TRUE;
  2843. lastAttrEntry = NULL;
  2844. }
  2845. }
  2846. else
  2847. UnknownAttr = FALSE;
  2848. if (tableEntry != NULL && HTMLcontext.lastElement != NULL &&
  2849. (!HTMLcontext.lastElementClosed ||
  2850. (HTMLcontext.lastElement != rootElement)))
  2851. {
  2852. lastAttrEntry = tableEntry;
  2853. translation = lastAttrEntry->AttrOrContent;
  2854. switch (translation)
  2855. {
  2856. case 'C': /* Content */
  2857. /* Nothing to do yet: wait for attribute value */
  2858. break;
  2859. case 'A':
  2860. /* create an attribute for current element */
  2861. attrType.AttrSSchema = schema;
  2862. attrType.AttrTypeNum = tableEntry->ThotAttribute;
  2863. CreateHTMLAttribute (HTMLcontext.lastElement, attrType, (char *)inputBuffer,
  2864. (ThotBool)(tableEntry == &pHTMLAttributeMapping[0]),
  2865. HTMLcontext.doc, &lastAttribute, &lastAttrElement);
  2866. if (attrType.AttrTypeNum == HTML_ATTR_HREF_)
  2867. {
  2868. elType = TtaGetElementType (HTMLcontext.lastElement);
  2869. if (elType.ElTypeNum == HTML_EL_Anchor)
  2870. /* attribute HREF for element Anchor */
  2871. /* create attribute PseudoClass = link */
  2872. {
  2873. attrType.AttrTypeNum = HTML_ATTR_PseudoClass;
  2874. attr = TtaNewAttribute (attrType);
  2875. TtaAttachAttribute (HTMLcontext.lastElement, attr,
  2876. HTMLcontext.doc);
  2877. TtaSetAttributeText (attr, "link",
  2878. HTMLcontext.lastElement,
  2879. HTMLcontext.doc);
  2880. }
  2881. }
  2882. else if (attrType.AttrTypeNum == HTML_ATTR_Checked)
  2883. {
  2884. /* create Default-Checked attribute */
  2885. attrType.AttrSSchema = DocumentSSchema;
  2886. attrType.AttrTypeNum = HTML_ATTR_DefaultChecked;
  2887. attr = TtaNewAttribute (attrType);
  2888. TtaAttachAttribute (HTMLcontext.lastElement, attr,
  2889. HTMLcontext.doc);
  2890. TtaSetAttributeValue (attr, HTML_ATTR_DefaultChecked_VAL_Yes_,
  2891. HTMLcontext.lastElement, HTMLcontext.doc);
  2892. }
  2893. else if (attrType.AttrTypeNum == HTML_ATTR_Selected)
  2894. {
  2895. /* create Default-Selected attribute */
  2896. attrType.AttrSSchema = DocumentSSchema;
  2897. attrType.AttrTypeNum = HTML_ATTR_DefaultSelected;
  2898. attr = TtaNewAttribute (attrType);
  2899. TtaAttachAttribute (HTMLcontext.lastElement, attr,
  2900. HTMLcontext.doc);
  2901. TtaSetAttributeValue (attr, HTML_ATTR_DefaultSelected_VAL_Yes_,
  2902. HTMLcontext.lastElement, HTMLcontext.doc);
  2903. }
  2904. break;
  2905. case SPACE:
  2906. /* nothing to do */
  2907. break;
  2908. default:
  2909. break;
  2910. }
  2911. }
  2912. InitBuffer ();
  2913. }
  2914. /*----------------------------------------------------------------------
  2915. EndOfAttrNameAndTag A ">" has been read. It indicates the
  2916. end of an attribute name and the end of a start tag.
  2917. ----------------------------------------------------------------------*/
  2918. static void EndOfAttrNameAndTag (char c)
  2919. {
  2920. EndOfAttrName (c);
  2921. EndOfStartTag (c);
  2922. if (c == '<')
  2923. {
  2924. HTMLParseError (HTMLcontext.doc, "Syntax error", 0);
  2925. StartOfTag (c);
  2926. }
  2927. }
  2928. /*----------------------------------------------------------------------
  2929. StartOfQuotedAttrValue
  2930. A quote (or double quote) starting an attribute value has been read.
  2931. ----------------------------------------------------------------------*/
  2932. static void StartOfQuotedAttrValue (char c)
  2933. {
  2934. ReadingAnAttrValue = TRUE;
  2935. if (UnknownAttr)
  2936. /* this is the value of an unknown attribute. keep the quote */
  2937. /* in the input buffer for copying it in the current */
  2938. /* Invalid_attribute */
  2939. PutInBuffer (c);
  2940. }
  2941. /*----------------------------------------------------------------------
  2942. StartOfUnquotedAttrValue
  2943. The first character of an unquoted attribute value has been read.
  2944. ----------------------------------------------------------------------*/
  2945. static void StartOfUnquotedAttrValue (char c)
  2946. {
  2947. ReadingAnAttrValue = TRUE;
  2948. PutInBuffer (c);
  2949. }
  2950. static ThotBool isAttrValueTruncated;
  2951. /*----------------------------------------------------------------------
  2952. EndOfAttrValue
  2953. An attribute value has been read from the HTML file.
  2954. Put that value in the current Thot attribute.
  2955. ----------------------------------------------------------------------*/
  2956. static void EndOfAttrValue (char c)
  2957. {
  2958. Element elText;
  2959. ElementType elType;
  2960. char *newBufferAttrValue;
  2961. int lg;
  2962. if (TruncatedAttrValue)
  2963. {
  2964. isAttrValueTruncated = TRUE;
  2965. if (BufferAttrValue == NULL)
  2966. {
  2967. lg = 2 * MaxBufferLength;
  2968. BufferAttrValue = (char*)TtaGetMemory (lg + 1);
  2969. strcpy ((char *)BufferAttrValue, (char *)inputBuffer);
  2970. LgBufferAttrValue = lg;
  2971. }
  2972. else
  2973. {
  2974. LgBufferAttrValue += MaxBufferLength;
  2975. newBufferAttrValue = (char*)TtaGetMemory (LgBufferAttrValue + 1);
  2976. strcpy ((char *)newBufferAttrValue, (char *)BufferAttrValue);
  2977. strcat ((char *)newBufferAttrValue, (char *)inputBuffer);
  2978. TtaFreeMemory (BufferAttrValue);
  2979. BufferAttrValue = newBufferAttrValue;
  2980. }
  2981. }
  2982. else
  2983. {
  2984. ReadingAnAttrValue = FALSE;
  2985. if (UnknownAttr)
  2986. /* this is the end of value of an invalid attribute. Keep the */
  2987. /* quote character that ends the value for copying it into the */
  2988. /* Invalid_attribute. */
  2989. if (c == '\'' || c == '\"')
  2990. PutInBuffer (c);
  2991. CloseBuffer ();
  2992. /* inputBuffer contains the attribute value */
  2993. if (UnknownTag && HTMLcontext.lastElement)
  2994. {
  2995. elType = TtaGetElementType (HTMLcontext.lastElement);
  2996. if (elType.ElTypeNum == HTML_EL_Invalid_element ||
  2997. elType.ElTypeNum == HTML_EL_Unknown_namespace)
  2998. {
  2999. elText = TtaGetLastChild (HTMLcontext.lastElement);
  3000. TtaAppendTextContent (elText, (unsigned char *)"=",
  3001. HTMLcontext.doc);
  3002. TtaAppendTextContent (elText, (unsigned char *)inputBuffer,
  3003. HTMLcontext.doc);
  3004. }
  3005. InitBuffer ();
  3006. lastAttrEntry = NULL;
  3007. return;
  3008. }
  3009. if (lastAttrEntry == NULL)
  3010. {
  3011. InitBuffer ();
  3012. return;
  3013. }
  3014. if (HTMLcontext.lastElementClosed &&
  3015. (HTMLcontext.lastElement == rootElement))
  3016. {
  3017. /* an attribute after the tag </html>, ignore it */
  3018. }
  3019. else
  3020. {
  3021. if (isAttrValueTruncated)
  3022. {
  3023. strcat ((char *)BufferAttrValue, (char *)inputBuffer);
  3024. EndOfHTMLAttributeValue (BufferAttrValue, lastAttrEntry,
  3025. lastAttribute, lastAttrElement,
  3026. UnknownAttr, &HTMLcontext,
  3027. FALSE/*HTML parser*/);
  3028. TtaFreeMemory (BufferAttrValue);
  3029. BufferAttrValue = NULL;
  3030. LgBufferAttrValue = 0;
  3031. isAttrValueTruncated = FALSE;
  3032. }
  3033. else
  3034. {
  3035. EndOfHTMLAttributeValue ((char *)inputBuffer, lastAttrEntry, lastAttribute,
  3036. lastAttrElement, UnknownAttr, &HTMLcontext,
  3037. FALSE/*HTML parser*/);
  3038. }
  3039. }
  3040. InitBuffer ();
  3041. }
  3042. }
  /*----------------------------------------------------------------------
    EndOfAttrValueAndTag    The character that ends an attribute value
    also ends the start tag: the value is completed first, then the
    start tag is closed.
    ----------------------------------------------------------------------*/
  static void EndOfAttrValueAndTag (char c)
  {
    /* order matters: the value belongs to the tag being closed */
    EndOfAttrValue (c);
    EndOfStartTag (c);
  }
  3052. /*----------------------------------------------------------------------
  3053. StartOfEntity A character '&' has been encountered in the text.
  3054. ----------------------------------------------------------------------*/
  3055. static void StartOfEntity (char c)
  3056. {
  3057. LgEntityName = 0;
  3058. EntityTableEntry = 0;
  3059. CharRank = 0;
  3060. }
  3061. /*----------------------------------------------------------------------
  3062. GetFallbackCharacter
  3063. Parameter lang gives the language of the enclosing element.
  3064. Returns the fallback string and the language.
  3065. ----------------------------------------------------------------------*/
  3066. void GetFallbackCharacter (int code, unsigned char *fallback, Language *lang)
  3067. {
  3068. unsigned char *ptr;
  3069. int i;
  3070. fallback[0] = EOS;
  3071. fallback[1] = EOS;
  3072. fallback[2] = EOS;
  3073. /* get the UTF-8 string of the unicode character */
  3074. ptr = fallback;
  3075. i = TtaWCToMBstring ((wchar_t) code, &ptr);
  3076. fallback[i] = EOS;
  3077. }
  #ifdef LC
  /*----------------------------------------------------------------------
    PutAmpersandInDoc
    Flushes the pending text, then inserts in the document a separate
    text element containing "&" and carrying the attribute
    IntEntity = Yes. Compiled only when LC is defined.
    ----------------------------------------------------------------------*/
  static void PutAmpersandInDoc ()
  {
    ElementType     textType;
    Element         ampEl;
    AttributeType   entityType;
    Attribute       entityAttr;

    /* whatever is in the input buffer goes to the document first */
    TextToDocument ();

    /* a text element of its own for the '&' */
    textType.ElSSchema = DocumentSSchema;
    textType.ElTypeNum = HTML_EL_TEXT_UNIT;
    ampEl = TtaNewElement (HTMLcontext.doc, textType);
    TtaSetElementLineNumber (ampEl, NumberOfLinesRead);
    InsertElement (&ampEl);
    /* the element is complete; later text must not be merged into it */
    HTMLcontext.lastElementClosed = TRUE;
    HTMLcontext.mergeText = FALSE;
    TtaSetTextContent (ampEl, (unsigned char *)"&", HTMLcontext.language, HTMLcontext.doc);

    /* mark it with IntEntity = Yes */
    entityType.AttrSSchema = DocumentSSchema;
    entityType.AttrTypeNum = HTML_ATTR_IntEntity;
    entityAttr = TtaNewAttribute (entityType);
    TtaAttachAttribute (ampEl, entityAttr, HTMLcontext.doc);
    TtaSetAttributeValue (entityAttr, HTML_ATTR_IntEntity_VAL_Yes_, ampEl, HTMLcontext.doc);
  }
  #endif /* LC */
  3107. /*----------------------------------------------------------------------
  3108. EndOfEntity End of a HTML entity. Search that entity in the
  3109. entity table and put the corresponding character in the input buffer.
  3110. ----------------------------------------------------------------------*/
  3111. static void EndOfEntity (unsigned char c)
  3112. {
  3113. unsigned char fallback[7], *ptr;
  3114. int len;
  3115. int i;
  3116. unsigned char msgBuffer[MaxMsgLength];
  3117. EntityName[LgEntityName] = EOS;
  3118. if (XhtmlEntityTable[EntityTableEntry].charName[CharRank] == EOS)
  3119. {
  3120. /* the entity read matches the current entry of entity table */
  3121. if (XhtmlEntityTable[EntityTableEntry].charCode > 127)
  3122. {
  3123. /* generate the UTF-8 string */
  3124. ptr = fallback;
  3125. len = TtaWCToMBstring ((wchar_t) (XhtmlEntityTable[EntityTableEntry].charCode), &ptr);
  3126. for (i = 0; i < len; i++)
  3127. PutInBuffer (fallback[i]);
  3128. }
  3129. else
  3130. PutInBuffer ((char)XhtmlEntityTable[EntityTableEntry].charCode);
  3131. }
  3132. else
  3133. {
  3134. /* entity not in the table. Print an error message */
  3135. PutInBuffer ('&');
  3136. for (i = 0; i < LgEntityName; i++)
  3137. PutInBuffer (EntityName[i]);
  3138. PutInBuffer (';');
  3139. /* print an error message */
  3140. sprintf ((char *)msgBuffer, "Unknown entity");
  3141. HTMLParseError (HTMLcontext.doc, (char *)msgBuffer, 0);
  3142. }
  3143. LgEntityName = 0;
  3144. }
  3145. /*----------------------------------------------------------------------
  3146. EntityChar A character belonging to a HTML entity has been
  3147. read.
  3148. ----------------------------------------------------------------------*/
  3149. static void EntityChar (unsigned char c)
  3150. {
  3151. unsigned char fallback[7], *ptr;
  3152. int len;
  3153. unsigned char msgBuffer[MaxMsgLength];
  3154. int i;
  3155. ThotBool OK, done, stop;
  3156. done = FALSE;
  3157. if (XhtmlEntityTable[EntityTableEntry].charName[CharRank] == EOS)
  3158. /* the entity name read so far matches the current entry of */
  3159. /* entity table */
  3160. /* does it also match the next entry? */
  3161. {
  3162. OK = FALSE;
  3163. i = EntityTableEntry+1;
  3164. stop = FALSE;
  3165. do
  3166. {
  3167. if (strncmp (EntityName, XhtmlEntityTable[i].charName, LgEntityName) != 0)
  3168. stop = TRUE;
  3169. else if (XhtmlEntityTable[i].charName[CharRank] < c)
  3170. i++;
  3171. else
  3172. {
  3173. stop = TRUE;
  3174. if (XhtmlEntityTable[i].charName[CharRank] == c)
  3175. OK = TRUE;
  3176. }
  3177. }
  3178. while (!stop);
  3179. if (!OK &&
  3180. (c == SPACE || c == EOL || c == TAB || c == __CR__))
  3181. {
  3182. /* If we are not reading an attribute value, assume that semicolon is
  3183. missing and put the corresponding char in the document content */
  3184. EntityName[LgEntityName] = EOS;
  3185. if (XhtmlEntityTable[EntityTableEntry].charCode > 127)
  3186. {
  3187. /* generate the UTF-8 string */
  3188. ptr = fallback;
  3189. len = TtaWCToMBstring ((wchar_t) (XhtmlEntityTable[EntityTableEntry].charCode), &ptr);
  3190. for (i = 0; i < len; i++)
  3191. PutInBuffer (fallback[i]);
  3192. }
  3193. else
  3194. PutInBuffer ((char)(XhtmlEntityTable[EntityTableEntry].charCode));
  3195. if (c != SPACE)
  3196. /* print an error message */
  3197. HTMLParseError (HTMLcontext.doc, "Missing semicolon", 0);
  3198. /* next state is the return state from the entity subautomaton, not
  3199. the state computed by the automaton. In addition the character read
  3200. has not been processed yet */
  3201. NormalTransition = FALSE;
  3202. currentState = returnState;
  3203. /* end of entity */
  3204. LgEntityName = 0;
  3205. done = TRUE;
  3206. }
  3207. }
  3208. if (!done)
  3209. {
  3210. while (XhtmlEntityTable[EntityTableEntry].charName[CharRank] < c
  3211. && XhtmlEntityTable[EntityTableEntry].charCode != 0)
  3212. EntityTableEntry++;
  3213. if (XhtmlEntityTable[EntityTableEntry].charName[CharRank] != c)
  3214. OK = FALSE;
  3215. else
  3216. {
  3217. if (LgEntityName > 0 &&
  3218. strncmp (EntityName,
  3219. XhtmlEntityTable[EntityTableEntry].charName,
  3220. LgEntityName) != 0)
  3221. OK = FALSE;
  3222. else
  3223. {
  3224. OK = TRUE;
  3225. CharRank++;
  3226. if (LgEntityName < MaxEntityLength - 1)
  3227. EntityName[LgEntityName++] = c;
  3228. }
  3229. }
  3230. if (!OK)
  3231. {
  3232. /* the entity name read so far is not in the table */
  3233. /* invalid entity */
  3234. /* put the entity name in the buffer */
  3235. PutInBuffer ('&');
  3236. for (i = 0; i < LgEntityName; i++)
  3237. PutInBuffer (EntityName[i]);
  3238. /* print an error message only if it's not the first character
  3239. after '&' or if it is a letter */
  3240. if (LgEntityName > 0 ||
  3241. ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
  3242. {
  3243. /* print an error message */
  3244. EntityName[LgEntityName++] = c;
  3245. EntityName[LgEntityName++] = EOS;
  3246. sprintf ((char *)msgBuffer, "Unknown entity");
  3247. HTMLParseError (HTMLcontext.doc, (char *)msgBuffer, 0);
  3248. }
  3249. /* next state is the return state from the entity subautomaton,
  3250. not the state computed by the automaton.
  3251. In addition the character read has not been processed yet */
  3252. NormalTransition = FALSE;
  3253. currentState = returnState;
  3254. /* end of entity */
  3255. LgEntityName = 0;
  3256. }
  3257. }
  3258. }
  3259. /*----------------------------------------------------------------------
  3260. EndOfDecEntity End of a decimal entity. Convert the
  3261. string read into a number and put the character
  3262. having that code in the input buffer.
  3263. ----------------------------------------------------------------------*/
  3264. static void EndOfDecEntity (unsigned char c)
  3265. {
  3266. unsigned char fallback[7], *ptr;
  3267. int len;
  3268. int code;
  3269. int i;
  3270. EntityName[LgEntityName] = EOS;
  3271. sscanf (EntityName, "%d", &code);
  3272. if (code > 127)
  3273. {
  3274. /* generate the UTF-8 string */
  3275. ptr = fallback;
  3276. len = TtaWCToMBstring ((wchar_t) code, &ptr);
  3277. for (i = 0; i < len; i++)
  3278. PutInBuffer (fallback[i]);
  3279. }
  3280. else
  3281. PutInBuffer ((char) code);
  3282. LgEntityName = 0;
  3283. }
  3284. /*----------------------------------------------------------------------
  3285. DecEntityChar A character belonging to a decimal entity has been read.
  3286. Put that character in the entity buffer.
  3287. ----------------------------------------------------------------------*/
  3288. static void DecEntityChar (unsigned char c)
  3289. {
  3290. int i;
  3291. if (LgEntityName < MaxEntityLength - 1)
  3292. {
  3293. /* the entity buffer is not full */
  3294. if (c >= '0' && c <= '9')
  3295. /* the character is a decimal digit */
  3296. EntityName[LgEntityName++] = c;
  3297. else
  3298. /* not a decimal digit. assume end of entity */
  3299. {
  3300. if (c == '<')
  3301. /* accept start of tag as an end of entity */
  3302. EndOfDecEntity (c);
  3303. else
  3304. {
  3305. PutInBuffer ('&');
  3306. PutInBuffer ('#');
  3307. for (i = 0; i < LgEntityName; i++)
  3308. PutInBuffer (EntityName[i]);
  3309. LgEntityName = 0;
  3310. /* error message */
  3311. HTMLParseError (HTMLcontext.doc, "Invalid decimal entity", 0);
  3312. }
  3313. /* next state is state 0, not the state computed by the automaton */
  3314. /* and the character read has not been processed yet */
  3315. NormalTransition = FALSE;
  3316. currentState = 0;
  3317. }
  3318. }
  3319. }
  3320. /*----------------------------------------------------------------------
  3321. EndOfHexEntity End of an hexadecimal entity. Convert the
  3322. string read into a number and put the character
  3323. having that code in the input buffer.
  3324. ----------------------------------------------------------------------*/
  3325. static void EndOfHexEntity (unsigned char c)
  3326. {
  3327. unsigned char fallback[7], *ptr;
  3328. int len;
  3329. int code;
  3330. int i;
  3331. EntityName[LgEntityName] = EOS;
  3332. sscanf (EntityName, "%x", &code);
  3333. if (code > 127)
  3334. {
  3335. /* generate the UTF-8 string */
  3336. ptr = fallback;
  3337. len = TtaWCToMBstring ((wchar_t) code, &ptr);
  3338. for (i = 0; i < len; i++)
  3339. PutInBuffer (fallback[i]);
  3340. }
  3341. else
  3342. PutInBuffer ((char) code);
  3343. LgEntityName = 0;
  3344. }
  3345. /*----------------------------------------------------------------------
  3346. HexEntityChar A character belonging to an hexadecimal entity has been
  3347. read. Put that character in the entity buffer.
  3348. ----------------------------------------------------------------------*/
  3349. static void HexEntityChar (char c)
  3350. {
  3351. int i;
  3352. if (LgEntityName < MaxEntityLength - 1)
  3353. {
  3354. /* the entity buffer is not full */
  3355. if ((c >= '0' && c <= '9') ||
  3356. (c >= 'a' && c <= 'f') ||
  3357. (c >= 'A' && c <= 'F'))
  3358. /* the character is a valid hexadecimal digit */
  3359. EntityName[LgEntityName++] = c;
  3360. else
  3361. {
  3362. /* not an hexadecimal digit. Assume end of entity */
  3363. if (c == '<')
  3364. /* accept start of tag as the end of the entity */
  3365. EndOfHexEntity (c);
  3366. else
  3367. /* error */
  3368. {
  3369. PutInBuffer ('&');
  3370. PutInBuffer ('#');
  3371. PutInBuffer ('x');
  3372. for (i = 0; i < LgEntityName; i++)
  3373. PutInBuffer (EntityName[i]);
  3374. LgEntityName = 0;
  3375. /* error message */
  3376. HTMLParseError (HTMLcontext.doc, "Invalid hexadecimal entity", 0);
  3377. }
  3378. /* next state is state 0, not the state computed by the automaton */
  3379. /* and the character read has not been processed yet */
  3380. NormalTransition = FALSE;
  3381. currentState = 0;
  3382. }
  3383. }
  3384. }
  3385. /*----------------------------------------------------------------------
  3386. EndOfDocument End of the HTML file. Terminate the Thot
  3387. document
  3388. ----------------------------------------------------------------------*/
  3389. static void EndOfDocument ()
  3390. {
  3391. if (LgBuffer > 0)
  3392. TextToDocument ();
  3393. }
  3394. /*----------------------------------------------------------------------
  3395. PutLess put '<' in the input buffer
  3396. ----------------------------------------------------------------------*/
  3397. static void PutLess (char c)
  3398. {
  3399. PutInBuffer ('<');
  3400. }
  3401. /*----------------------------------------------------------------------
  3402. PutAmpersandSpace put '& ' in the input buffer.
  3403. ----------------------------------------------------------------------*/
  3404. static void PutAmpersandSpace (char c)
  3405. {
  3406. PutInBuffer ('&');
  3407. PutInBuffer (SPACE);
  3408. }
  3409. /*----------------------------------------------------------------------
  3410. PutLessAndSpace put '<' and the space read in the input buffer.
  3411. ----------------------------------------------------------------------*/
  3412. static void PutLessAndSpace (char c)
  3413. {
  3414. PutInBuffer ('<');
  3415. PutInBuffer (c);
  3416. }
  3417. /*----------------------------------------------------------------------
  3418. StartOfComment Beginning of a HTML comment.
  3419. ----------------------------------------------------------------------*/
  3420. static void StartOfComment (char c)
  3421. {
  3422. ElementType elType;
  3423. Element elComment, elCommentLine;
  3424. /* create a Thot element Comment */
  3425. elType.ElSSchema = DocumentSSchema;
  3426. elType.ElTypeNum = HTML_EL_Comment_;
  3427. elComment = TtaNewElement (HTMLcontext.doc, elType);
  3428. TtaSetElementLineNumber (elComment, NumberOfLinesRead);
  3429. InsertElement (&elComment);
  3430. /* create a Comment_line element as the first child of */
  3431. /* element Comment */
  3432. if (elComment != NULL)
  3433. {
  3434. elType.ElTypeNum = HTML_EL_Comment_line;
  3435. elCommentLine = TtaNewElement (HTMLcontext.doc, elType);
  3436. TtaSetElementLineNumber (elCommentLine, NumberOfLinesRead);
  3437. TtaInsertFirstChild (&elCommentLine, elComment, HTMLcontext.doc);
  3438. /* create a TEXT element as the first child of element Comment_line */
  3439. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  3440. CommentText = TtaNewElement (HTMLcontext.doc, elType);
  3441. TtaSetElementLineNumber (CommentText, NumberOfLinesRead);
  3442. TtaInsertFirstChild (&CommentText, elCommentLine, HTMLcontext.doc);
  3443. TtaSetTextContent (CommentText, (unsigned char *)"", HTMLcontext.language,
  3444. HTMLcontext.doc);
  3445. }
  3446. InitBuffer ();
  3447. }
  3448. /*----------------------------------------------------------------------
  3449. PutInComment put character c in the current HTML comment.
  3450. ----------------------------------------------------------------------*/
  3451. static void PutInComment (unsigned char c)
  3452. {
  3453. ElementType elType;
  3454. Element elCommentLine, prevElCommentLine;
  3455. if (c != EOS)
  3456. {
  3457. if (!HTMLcontext.parsingCSS && ((int) c == EOL || (int) c == __CR__))
  3458. /* new line in a comment */
  3459. {
  3460. /* put the content of the inputBuffer into the current */
  3461. /* Comment_line element */
  3462. CloseBuffer ();
  3463. TtaAppendTextContent (CommentText, (unsigned char *)inputBuffer, HTMLcontext.doc);
  3464. InitBuffer ();
  3465. /* create a new Comment_line element */
  3466. elType.ElSSchema = DocumentSSchema;
  3467. elType.ElTypeNum = HTML_EL_Comment_line;
  3468. elCommentLine = TtaNewElement (HTMLcontext.doc, elType);
  3469. TtaSetElementLineNumber (elCommentLine, NumberOfLinesRead);
  3470. /* inserts the new Comment_line element after the previous one */
  3471. prevElCommentLine = TtaGetParent (CommentText);
  3472. TtaInsertSibling (elCommentLine, prevElCommentLine, FALSE, HTMLcontext.doc);
  3473. /* create a TEXT element as the first child of the new element
  3474. Comment_line */
  3475. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  3476. CommentText = TtaNewElement (HTMLcontext.doc, elType);
  3477. TtaSetElementLineNumber (CommentText, NumberOfLinesRead);
  3478. TtaInsertFirstChild (&CommentText, elCommentLine, HTMLcontext.doc);
  3479. TtaSetTextContent (CommentText, (unsigned char *)"", HTMLcontext.language, HTMLcontext.doc);
  3480. }
  3481. else
  3482. {
  3483. if (LgBuffer >= MaxBufferLength - 1)
  3484. {
  3485. CloseBuffer ();
  3486. TtaAppendTextContent (CommentText, (unsigned char *)inputBuffer,
  3487. HTMLcontext.doc);
  3488. InitBuffer ();
  3489. }
  3490. inputBuffer[LgBuffer++] = c;
  3491. }
  3492. }
  3493. }
  3494. /*----------------------------------------------------------------------
  3495. EndOfComment End of a HTML comment.
  3496. ----------------------------------------------------------------------*/
  3497. static void EndOfComment (char c)
  3498. {
  3499. if (LgBuffer > 0)
  3500. {
  3501. CloseBuffer ();
  3502. if (CommentText != NULL)
  3503. TtaAppendTextContent (CommentText, (unsigned char *)inputBuffer,
  3504. HTMLcontext.doc);
  3505. }
  3506. CommentText = NULL;
  3507. HTMLcontext.lastElementClosed = TRUE;
  3508. InitBuffer ();
  3509. }
  3510. /*----------------------------------------------------------------------
  3511. StartOfASP Beginning of a HTML ASP
  3512. ----------------------------------------------------------------------*/
  3513. static void StartOfASP (char c)
  3514. {
  3515. ElementType elType;
  3516. Element elASP, elASPLine;
  3517. /* create a Thot element ASP */
  3518. elType.ElSSchema = DocumentSSchema;
  3519. elType.ElTypeNum = HTML_EL_ASP_element;
  3520. elASP = TtaNewElement (HTMLcontext.doc, elType);
  3521. TtaSetElementLineNumber (elASP, NumberOfLinesRead);
  3522. InsertElement (&elASP);
  3523. /* create a ASP_line element as the first child of */
  3524. /* element ASP */
  3525. if (elASP != NULL)
  3526. {
  3527. elType.ElTypeNum = HTML_EL_ASP_line;
  3528. elASPLine = TtaNewElement (HTMLcontext.doc, elType);
  3529. TtaSetElementLineNumber (elASPLine, NumberOfLinesRead);
  3530. TtaInsertFirstChild (&elASPLine, elASP, HTMLcontext.doc);
  3531. /* create a TEXT element as the first child of element ASP_line */
  3532. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  3533. ASPText = TtaNewElement (HTMLcontext.doc, elType);
  3534. TtaSetElementLineNumber (ASPText, NumberOfLinesRead);
  3535. TtaInsertFirstChild (&ASPText, elASPLine, HTMLcontext.doc);
  3536. TtaSetTextContent (ASPText, (unsigned char *)"", HTMLcontext.language,
  3537. HTMLcontext.doc);
  3538. }
  3539. InitBuffer ();
  3540. }
  3541. /*----------------------------------------------------------------------
  3542. PutInASP put character c in the current HTML ASP
  3543. ----------------------------------------------------------------------*/
  3544. static void PutInASP (unsigned char c)
  3545. {
  3546. ElementType elType;
  3547. Element elASPLine, prevElASPLine;
  3548. if (c != EOS)
  3549. {
  3550. if (!HTMLcontext.parsingCSS && ((int) c == EOL || (int) c == __CR__))
  3551. /* new line in a ASP */
  3552. {
  3553. /* put the content of the inputBuffer into the current */
  3554. /* ASP_line element */
  3555. CloseBuffer ();
  3556. TtaAppendTextContent (ASPText, (unsigned char *)inputBuffer, HTMLcontext.doc);
  3557. InitBuffer ();
  3558. /* create a new ASP_line element */
  3559. elType.ElSSchema = DocumentSSchema;
  3560. elType.ElTypeNum = HTML_EL_ASP_line;
  3561. elASPLine = TtaNewElement (HTMLcontext.doc, elType);
  3562. TtaSetElementLineNumber (elASPLine, NumberOfLinesRead);
  3563. /* inserts the new ASP_line element after the previous one */
  3564. prevElASPLine = TtaGetParent (ASPText);
  3565. TtaInsertSibling (elASPLine, prevElASPLine, FALSE, HTMLcontext.doc);
  3566. /* create a TEXT element as the first child of the new element
  3567. ASP_line */
  3568. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  3569. ASPText = TtaNewElement (HTMLcontext.doc, elType);
  3570. TtaSetElementLineNumber (ASPText, NumberOfLinesRead);
  3571. TtaInsertFirstChild (&ASPText, elASPLine, HTMLcontext.doc);
  3572. TtaSetTextContent (ASPText, (unsigned char *)"", HTMLcontext.language, HTMLcontext.doc);
  3573. }
  3574. else
  3575. {
  3576. if (LgBuffer >= MaxBufferLength - 1)
  3577. {
  3578. CloseBuffer ();
  3579. TtaAppendTextContent (ASPText, (unsigned char *)inputBuffer,
  3580. HTMLcontext.doc);
  3581. InitBuffer ();
  3582. }
  3583. inputBuffer[LgBuffer++] = c;
  3584. }
  3585. }
  3586. }
  3587. /*----------------------------------------------------------------------
  3588. EndOfASP End of a HTML ASP
  3589. ----------------------------------------------------------------------*/
  3590. static void EndOfASP (char c)
  3591. {
  3592. if (LgBuffer > 0)
  3593. {
  3594. CloseBuffer ();
  3595. if (ASPText != NULL)
  3596. TtaAppendTextContent (ASPText, (unsigned char *)inputBuffer,
  3597. HTMLcontext.doc);
  3598. }
  3599. ASPText = NULL;
  3600. HTMLcontext.lastElementClosed = TRUE;
  3601. InitBuffer ();
  3602. }
  3603. /*----------------------------------------------------------------------
  3604. PutDash put a dash character in the current comment.
  3605. ----------------------------------------------------------------------*/
  3606. static void PutDash (char c)
  3607. {
  3608. PutInComment ('-');
  3609. PutInComment (c);
  3610. }
  3611. /*----------------------------------------------------------------------
  3612. PutDashDash put 2 dash characters in the current comment.
  3613. ----------------------------------------------------------------------*/
  3614. static void PutDashDash (char c)
  3615. {
  3616. PutInComment ('-');
  3617. PutInComment ('-');
  3618. PutInComment (c);
  3619. }
  3620. /*----------------------------------------------------------------------
  3621. PutQuestionMark put a question mark in the current PI.
  3622. ----------------------------------------------------------------------*/
  3623. static void PutQuestionMark (char c)
  3624. {
  3625. PutInBuffer ('?');
  3626. PutInBuffer (c);
  3627. }
  3628. /*----------------------------------------------------------------------
  3629. EndOfDoctypeDecl A Doctype declaration has been read
  3630. ----------------------------------------------------------------------*/
  3631. static void EndOfDoctypeDecl (char c)
  3632. {
  3633. int i, j;
  3634. Element docEl, text, doctype, prev, doctypeLine;
  3635. ElementType elType;
  3636. unsigned char *buffer;
  3637. CloseBuffer ();
  3638. buffer = (unsigned char*)TtaGetMemory (strlen ((char *)inputBuffer) + 20);
  3639. strcpy ((char *)buffer, (char *)"<!DOCTYPE ");
  3640. j = strlen ((char *)buffer);
  3641. /* process the Doctype declaration available in inputBuffer */
  3642. if (!strncasecmp ((char *)inputBuffer, "doctype", 7))
  3643. {
  3644. for (i = 7; inputBuffer[i] <= SPACE && inputBuffer[i] != EOS; i++);
  3645. if (!strncasecmp ((char *)&inputBuffer[i], "HTML", 4))
  3646. /* it's a HTML document */
  3647. {
  3648. docEl = TtaGetMainRoot (HTMLcontext.doc);
  3649. elType = TtaGetElementType (docEl);
  3650. /* Create a DOCTYPE element */
  3651. elType.ElTypeNum = HTML_EL_DOCTYPE;
  3652. doctype = TtaNewElement (HTMLcontext.doc, elType);
  3653. TtaSetElementLineNumber (doctype, NumberOfLinesRead);
  3654. InsertElement (&doctype);
  3655. /* Make the DOCTYPE element read-only */
  3656. TtaSetAccessRight (doctype, ReadOnly, HTMLcontext.doc);
  3657. HTMLcontext.lastElement = doctype;
  3658. HTMLcontext.lastElementClosed = TRUE;
  3659. /* Create a DOCTYPE_line element as first child */
  3660. elType.ElTypeNum = HTML_EL_DOCTYPE_line;
  3661. doctypeLine = TtaNewElement (HTMLcontext.doc, elType);
  3662. TtaSetElementLineNumber (doctypeLine, NumberOfLinesRead);
  3663. TtaInsertFirstChild (&doctypeLine, doctype, HTMLcontext.doc);
  3664. /* Look for line breaks in the input buffer and create */
  3665. /* as many DOCTYPE_line elements as needed */
  3666. while (inputBuffer[i] != EOS)
  3667. {
  3668. if (inputBuffer[i] != EOL && inputBuffer[i] != __CR__)
  3669. buffer[j++] = inputBuffer[i];
  3670. else
  3671. {
  3672. buffer[j] = EOS;
  3673. j = 0;
  3674. elType.ElTypeNum = 1;
  3675. text = TtaNewElement (HTMLcontext.doc, elType);
  3676. if (text != NULL)
  3677. {
  3678. TtaSetElementLineNumber (text, NumberOfLinesRead);
  3679. /* get the position of the Doctype text */
  3680. TtaInsertFirstChild (&text, doctypeLine, HTMLcontext.doc);
  3681. /* We use the Latin_Script language to avoid the spell_checker */
  3682. /* the spell_chekcer to check the doctype */
  3683. TtaSetTextContent (text, (unsigned char *)buffer, Latin_Script, HTMLcontext.doc);
  3684. }
  3685. /* Create a new DOCTYPE_line element */
  3686. elType.ElTypeNum = HTML_EL_DOCTYPE_line;
  3687. prev = doctypeLine;
  3688. doctypeLine = TtaNewElement (HTMLcontext.doc, elType);
  3689. if (doctypeLine != NULL)
  3690. {
  3691. TtaSetElementLineNumber (doctypeLine, NumberOfLinesRead);
  3692. TtaInsertSibling (doctypeLine, prev, FALSE, HTMLcontext.doc);
  3693. }
  3694. }
  3695. i++;
  3696. }
  3697. buffer [j++] = '>';
  3698. buffer [j] = EOS;
  3699. elType.ElTypeNum = 1;
  3700. text = TtaNewElement (HTMLcontext.doc, elType);
  3701. if (text)
  3702. {
  3703. TtaSetElementLineNumber (text, NumberOfLinesRead);
  3704. /* get the position of the Doctype text */
  3705. TtaInsertFirstChild (&text, doctypeLine, HTMLcontext.doc);
  3706. /* We use the Latin_Script language to avoid the spell_chekcer */
  3707. /* the spell_chekcer to check the doctype */
  3708. TtaSetTextContent (text, (unsigned char *)buffer, Latin_Script, HTMLcontext.doc);
  3709. }
  3710. }
  3711. }
  3712. InitBuffer ();
  3713. TtaFreeMemory (buffer);
  3714. }
  3715. /*----------------------------------------------------------------------
  3716. StartOfPI Beginning of a HTML comment.
  3717. ----------------------------------------------------------------------*/
  3718. static void StartOfPI (char c)
  3719. {
  3720. ElementType elType;
  3721. Element elPI, elPILine;
  3722. /* create a Thot element PI */
  3723. elType.ElSSchema = DocumentSSchema;
  3724. elType.ElTypeNum = HTML_EL_XMLPI;
  3725. elPI = TtaNewElement (HTMLcontext.doc, elType);
  3726. TtaSetElementLineNumber (elPI, NumberOfLinesRead);
  3727. InsertElement (&elPI);
  3728. /* create a PI_line element as the first child of */
  3729. /* element PI */
  3730. if (elPI != NULL)
  3731. {
  3732. elType.ElTypeNum = HTML_EL_PI_line;
  3733. elPILine = TtaNewElement (HTMLcontext.doc, elType);
  3734. TtaSetElementLineNumber (elPILine, NumberOfLinesRead);
  3735. TtaInsertFirstChild (&elPILine, elPI, HTMLcontext.doc);
  3736. /* create a TEXT element as the first child of element PI_line */
  3737. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  3738. PIText = TtaNewElement (HTMLcontext.doc, elType);
  3739. TtaSetElementLineNumber (PIText, NumberOfLinesRead);
  3740. TtaInsertFirstChild (&PIText, elPILine, HTMLcontext.doc);
  3741. TtaSetTextContent (PIText, (unsigned char *)"", HTMLcontext.language,
  3742. HTMLcontext.doc);
  3743. }
  3744. InitBuffer ();
  3745. }
  3746. /*----------------------------------------------------------------------
  3747. PutInPI put character c in the current HTML comment.
  3748. ----------------------------------------------------------------------*/
  3749. static void PutInPI (unsigned char c)
  3750. {
  3751. ElementType elType;
  3752. Element elPILine, prevElPILine;
  3753. if (c != EOS)
  3754. {
  3755. if (!HTMLcontext.parsingCSS && ((int) c == EOL || (int) c == __CR__))
  3756. /* new line in a comment */
  3757. {
  3758. /* put the content of the inputBuffer into the current */
  3759. /* PI_line element */
  3760. CloseBuffer ();
  3761. TtaAppendTextContent (PIText, (unsigned char *)inputBuffer, HTMLcontext.doc);
  3762. InitBuffer ();
  3763. /* create a new PI_line element */
  3764. elType.ElSSchema = DocumentSSchema;
  3765. elType.ElTypeNum = HTML_EL_PI_line;
  3766. elPILine = TtaNewElement (HTMLcontext.doc, elType);
  3767. TtaSetElementLineNumber (elPILine, NumberOfLinesRead);
  3768. /* inserts the new PI_line element after the previous one */
  3769. prevElPILine = TtaGetParent (PIText);
  3770. TtaInsertSibling (elPILine, prevElPILine, FALSE, HTMLcontext.doc);
  3771. /* create a TEXT element as the first child of the new element
  3772. PI_line */
  3773. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  3774. PIText = TtaNewElement (HTMLcontext.doc, elType);
  3775. TtaSetElementLineNumber (PIText, NumberOfLinesRead);
  3776. TtaInsertFirstChild (&PIText, elPILine, HTMLcontext.doc);
  3777. TtaSetTextContent (PIText, (unsigned char *)"", HTMLcontext.language, HTMLcontext.doc);
  3778. }
  3779. else
  3780. {
  3781. if (LgBuffer >= MaxBufferLength - 1)
  3782. {
  3783. CloseBuffer ();
  3784. TtaAppendTextContent (PIText, (unsigned char *)inputBuffer,
  3785. HTMLcontext.doc);
  3786. InitBuffer ();
  3787. }
  3788. inputBuffer[LgBuffer++] = c;
  3789. }
  3790. }
  3791. }
  3792. /*----------------------------------------------------------------------
  3793. EndOfPI End of a HTML PI
  3794. ----------------------------------------------------------------------*/
  3795. static void EndOfPI (char c)
  3796. {
  3797. if (LgBuffer > 0)
  3798. {
  3799. CloseBuffer ();
  3800. if (PIText != NULL)
  3801. TtaAppendTextContent (PIText, (unsigned char *)inputBuffer,
  3802. HTMLcontext.doc);
  3803. }
  3804. PIText = NULL;
  3805. HTMLcontext.lastElementClosed = TRUE;
  3806. InitBuffer ();
  3807. }
  3808. /*----------------------------------------------------------------------
  3809. Do_nothing does nothing.
  3810. ----------------------------------------------------------------------*/
  3811. static void Do_nothing (char c)
  3812. {
  3813. }
/* some type definitions for the automaton */

/* pointer to a transition; used to chain the transitions that leave the
   same state */
typedef struct _Transition *PtrTransition;

typedef struct _Transition
{				/* a transition of the automaton in
				   "executable" form */
  unsigned char       trigger;	/* the input character that triggers
				   the transition */
  Proc                action;	/* the procedure to be called when
				   the transition occurs */
  State               newState;	/* the new state of the automaton
				   after the transition */
  PtrTransition       nextTransition;	/* next transition from the same
					   state */
}
Transition;
/* Descriptor of one state of the "executable" automaton: the state
   and the head of the list of transitions that leave it (the list is
   linked through Transition.nextTransition). */
typedef struct _StateDescr
{				/* a state of the automaton */
  State               automatonState;	/* the state */
  PtrTransition       firstTransition;	/* first transition from that state */
}
StateDescr;
/* the automaton that drives the HTML parser */
/* MaxState is the number of state descriptors available; the highest
   real state used by sourceAutomaton is 36 (1000 is only its end
   marker). NOTE(review): presumably indexed by state number - confirm
   against InitAutomaton. */
#define MaxState 40
static StateDescr automaton[MaxState];
/* One row of the sourceAutomaton table */
typedef struct _sourceTransition
{				/* a transition of the automaton in
				   "source" form */
  State               initState;	/* initial state of transition */
  char                trigger;	/* the input character that triggers
				   the transition */
  Proc                transitionAction;	/* the procedure to be called when
					   the transition occurs */
  State               newState;	/* final state of the transition */
}
sourceTransition;
  3849. /* the automaton in "source" form */
  3850. static sourceTransition sourceAutomaton[] =
  3851. {
  3852. /*
  3853. state, trigger, action, new state
  3854. */
  3855. /* state 0: reading character data */
  3856. {0, '<', (Proc) StartOfTag, 1},
  3857. {0, '&', (Proc) StartOfEntity, -30}, /* call subautomaton 30 */
  3858. {0, '*', (Proc) PutInBuffer, 0}, /* * = any other character */
  3859. /* state 1: '<' has been read */
  3860. {1, '%', (Proc) StartOfASP, 35},
  3861. {1, '/', (Proc) Do_nothing, 3},
  3862. {1, '!', (Proc) Do_nothing, 10},
  3863. {1, '?', (Proc) StartOfPI, 20},
  3864. {1, '<', (Proc) Do_nothing, 18},
  3865. {1, 'S', (Proc) PutLessAndSpace, 0}, /* S = Space */
  3866. {1, '*', (Proc) PutInBuffer, 2},
  3867. /* state 2: reading a start tag */
  3868. {2, '>', (Proc) EndOfStartGIandTag, 0},
  3869. {2, '<', (Proc) EndOfStartGIandTag, 1}, /* Error: tag not closed */
  3870. {2, '&', (Proc) StartOfEntity, -30}, /* call subautomaton 30 */
  3871. {2, 'S', (Proc) EndOfStartGI, 16}, /* S = Space */
  3872. {2, '*', (Proc) PutInBuffer, 2},
  3873. /* state 3: reading an end tag */
  3874. {3, '>', (Proc) EndOfEndTag, 0},
  3875. {3, '<', (Proc) EndOfEndTag, 1}, /* Error: tag not closed */
  3876. {3, '&', (Proc) StartOfEntity, -30}, /* call subautomaton 30 */
  3877. {3, 'S', (Proc) Do_nothing, 3},
  3878. {3, '*', (Proc) PutInBuffer, 3},
  3879. /* state 4: reading an attribute name */
  3880. {4, '=', (Proc) EndOfAttrName, 5},
  3881. {4, 'S', (Proc) EndOfAttrName, 17},
  3882. {4, '&', (Proc) StartOfEntity, -30}, /* call subautomaton 30 */
  3883. {4, '>', (Proc) EndOfAttrNameAndTag, 0},
  3884. {4, '<', (Proc) EndOfAttrNameAndTag, 1}, /* Error: tag not closed */
  3885. {4, '*', (Proc) PutInBuffer, 4},
  3886. /* state 5: expecting an attribute value */
  3887. {5, '\"', (Proc) StartOfQuotedAttrValue, 6},
  3888. {5, '\'', (Proc) StartOfQuotedAttrValue, 9},
  3889. {5, 'S', (Proc) Do_nothing, 5},
  3890. {5, '>', (Proc) EndOfStartTag, 0},
  3891. {5, '*', (Proc) StartOfUnquotedAttrValue, 7},
  3892. /* state 6: reading an attribute value between double quotes */
  3893. {6, '\"', (Proc) EndOfAttrValue, 8},
  3894. {6, '&', (Proc) StartOfEntity, -30}, /* call subautomaton 30... */
  3895. {6, '*', (Proc) PutInBuffer, 6},
  3896. /* state 7: reading an attribute value without delimiting quotes */
  3897. {7, '>', (Proc) EndOfAttrValueAndTag, 0},
  3898. {7, 'S', (Proc) EndOfAttrValue, 16},
  3899. {7, '&', (Proc) StartOfEntity, -30}, /* call subautomaton 30 */
  3900. {7, '*', (Proc) PutInBuffer, 7},
  3901. /* state 8: end of attribute value */
  3902. {8, '>', (Proc) EndOfStartTag, 0},
  3903. {8, '<', (Proc) EndOfStartTag, 1}, /* Error: tag not closed */
  3904. {8, 'S', (Proc) Do_nothing, 16},
  3905. {8, '*', (Proc) PutInBuffer, 4},
  3906. /* state 9: reading an attribute value between simple quotes */
  3907. {9, '\'', (Proc) EndOfAttrValue, 8},
  3908. {9, '&', (Proc) StartOfEntity, -30}, /* call subautomaton 30 */
  3909. {9, '*', (Proc) PutInBuffer, 9},
  3910. /* state 10: "<!" has been read */
  3911. {10, '-', (Proc) Do_nothing, 11},
  3912. {10, '[', (Proc) Do_nothing, 23}, /* call subautomaton 23 */
  3913. {10, 'S', (Proc) Do_nothing, 10},
  3914. {10, '>', (Proc) Do_nothing, 0}, /* weird empty comment <!> */
  3915. {10, '*', (Proc) PutInBuffer, 15},
  3916. /* state 11: "<!-" has been read. Probably a comment */
  3917. {11, '-', (Proc) StartOfComment, 12},
  3918. {11, '*', (Proc) PutInBuffer, 15}, /* incorrect comment, expect */
  3919. /* a closing '>' */
  3920. /* state 12: reading a comment */
  3921. {12, '-', (Proc) Do_nothing, 13},
  3922. {12, '*', (Proc) PutInComment, 12},
  3923. /* state 13: a dash "-" has been read within a comment */
  3924. {13, '-', (Proc) Do_nothing, 14},
  3925. {13, '*', (Proc) PutDash, 12},
  3926. /* state 14: a double dash "--" has been read within a comment */
  3927. {14, 'S', (Proc) Do_nothing, 14},
  3928. {14, '>', (Proc) EndOfComment, 0},
  3929. {14, '-', (Proc) PutInComment, 14},
  3930. {14, '*', (Proc) PutDashDash, 12},
  3931. /* state 15: '<!' has been read. It may be a doctype declaration */
  3932. {15, '>', (Proc) EndOfDoctypeDecl, 0},
  3933. {15, '*', (Proc) PutInBuffer, 15},
  3934. /* state 16: expecting an attribute name or an end of start tag */
  3935. {16, 'S', (Proc) Do_nothing, 16},
  3936. {16, '>', (Proc) EndOfStartTag, 0},
  3937. {16, '<', (Proc) EndOfStartTag, 1}, /* Error: tag not closed */
  3938. {16, '*', (Proc) PutInBuffer, 4},
  3939. /* state 17: expecting '=' after an attribute name */
  3940. {17, 'S', (Proc) Do_nothing, 17},
  3941. {17, '=', (Proc) Do_nothing, 5},
  3942. {17, '>', (Proc) EndOfStartTag, 0},
  3943. {17, '*', (Proc) PutInBuffer, 4},
  3944. /* state 18: '<' has been read */
  3945. {18, '!', (Proc) Do_nothing, 19},
  3946. {18, '*', (Proc) Do_nothing, 0},
  3947. /* state 19: '<!' has been read */
  3948. {19, '>', (Proc) PutLess, 0},
  3949. {19, '*', (Proc) Do_nothing, 0},
  3950. /* state 20: "<?" has been read; beginning of a Processing Instruction */
  3951. {20, '?', (Proc) Do_nothing, 21},
  3952. {20, '*', (Proc) PutInPI, 20},
  3953. /* state 21: reading the end of Processing Instruction? */
  3954. {21, '>', (Proc) EndOfPI, 0},
  3955. {21, '*', (Proc) PutQuestionMark, 20},
  3956. /* state 22: a question mark has been read in a Processing Instruction */
  3957. {22, '>', (Proc) EndOfPI, 0},
  3958. {22, '*', (Proc) PutQuestionMark, 20},
  3959. /* state 23: "<![*" has been read, wait for CDATA */
  3960. {23, '[', (Proc) StartCData, 24},
  3961. {23, 'C', (Proc) PutInBuffer, 23},
  3962. {23, 'D', (Proc) PutInBuffer, 23},
  3963. {23, 'A', (Proc) PutInBuffer, 23},
  3964. {23, 'T', (Proc) PutInBuffer, 23},
  3965. {23, ']', (Proc) PutInBuffer, -1},
  3966. {23, '*', (Proc) PutInBuffer, 15},
  3967. /* state 24: "<![CDATA[" has been read: read its contents */
  3968. {24, ']', (Proc) PutInBuffer, 25},
  3969. {24, '\n', (Proc) EndOfCDataLine, 24},
  3970. {24, '*', (Proc) PutInBuffer, 24},
  3971. /* state 25: "]" has been read: check the second "]" */
  3972. {25, ']', (Proc) PutInBuffer, 26},
  3973. {25, '*', (Proc) PutInBuffer, 24},
  3974. /* state 26: "]]" has been read: check the end of CDATA */
  3975. {26, ']', (Proc) PutInBuffer, 26},
  3976. {26, '>', (Proc) EndOfCData, 0},
  3977. {26, '*', (Proc) PutInBuffer, 24},
  3978. /* sub automaton for reading entities in various contexts */
  3979. /* state -1 means "return to calling state" */
  3980. /* state 30: a '&' has been read */
  3981. {30, '#', (Proc) Do_nothing, 32},
  3982. {30, 'S', (Proc) PutAmpersandSpace, -1}, /* return to calling state */
  3983. {30, '*', (Proc) EntityChar, 31},
  3984. /* state 31: reading a string entity */
  3985. {31, ';', (Proc) EndOfEntity, -1}, /* return to calling state */
  3986. {31, '*', (Proc) EntityChar, 31},
  3987. /* state 32: "&#" has been read: reading a numerical entity */
  3988. {32, 'x', (Proc) Do_nothing, 34},
  3989. {32, 'X', (Proc) Do_nothing, 34},
  3990. {32, '*', (Proc) DecEntityChar, 33},
  3991. /* state 33: "&#" has been read: reading a decimal value */
  3992. {33, ';', (Proc) EndOfDecEntity, -1}, /* return to calling state */
  3993. {33, '*', (Proc) DecEntityChar, 33},
  3994. /* state 34: "&#x" has been read: reading an hexadecimal value */
  3995. {34, ';', (Proc) EndOfHexEntity, -1}, /* return to calling state */
  3996. {34, '*', (Proc) HexEntityChar, 34},
  3997. /* state 35: reading a ASP */
  3998. {35, '%', (Proc) PutInASP, 36},
  3999. {35, '*', (Proc) PutInASP, 35},
  4000. /* state 36: reading a ASP */
  4001. {36, '>', (Proc) EndOfASP, 0},
  4002. {36, '*', (Proc) PutInASP, 35},
  4003. /* state 1000: fake state. End of automaton table */
  4004. /* the next line must be the last one in the automaton declaration */
  4005. {1000, '*', (Proc) Do_nothing, 1000}
  4006. };
  4007. /*----------------------------------------------------------------------
  4008. InitAutomaton read the "source" form of the automaton and
  4009. build the "executable" form.
  4010. ----------------------------------------------------------------------*/
  4011. void InitAutomaton (void)
  4012. {
  4013. int entry;
  4014. State theState;
  4015. State curState;
  4016. PtrTransition trans;
  4017. PtrTransition prevTrans;
  4018. for (entry = 0; entry < MaxState; entry++)
  4019. automaton[entry].firstTransition = NULL;
  4020. entry = 0;
  4021. curState = 1000;
  4022. prevTrans = NULL;
  4023. do
  4024. {
  4025. theState = sourceAutomaton[entry].initState;
  4026. if (theState < 1000)
  4027. {
  4028. trans = (PtrTransition) TtaGetMemory (sizeof (Transition));
  4029. trans->nextTransition = NULL;
  4030. trans->trigger = sourceAutomaton[entry].trigger;
  4031. trans->action = sourceAutomaton[entry].transitionAction;
  4032. trans->newState = sourceAutomaton[entry].newState;
  4033. if (trans->trigger == 'S') /* any spacing character */
  4034. trans->trigger = SPACE;
  4035. if (trans->trigger == '*') /* any character */
  4036. trans->trigger = EOS;
  4037. if (theState != curState)
  4038. {
  4039. automaton[theState].automatonState = theState;
  4040. automaton[theState].firstTransition = trans;
  4041. curState = theState;
  4042. }
  4043. else
  4044. prevTrans->nextTransition = trans;
  4045. prevTrans = trans;
  4046. entry++;
  4047. }
  4048. }
  4049. while (theState < 1000);
  4050. }
  4051. /*----------------------------------------------------------------------
  4052. FreeHTMLParser
  4053. Frees all ressources associated with the HTML parser.
  4054. ----------------------------------------------------------------------*/
  4055. void FreeHTMLParser (void)
  4056. {
  4057. PtrTransition trans, nextTrans;
  4058. PtrClosedElement pClose, nextClose;
  4059. int entry;
  4060. /* free the internal representation of the automaton */
  4061. for (entry = 0; entry < MaxState; entry++)
  4062. {
  4063. trans = automaton[entry].firstTransition;
  4064. while (trans != NULL)
  4065. {
  4066. nextTrans = trans->nextTransition;
  4067. TtaFreeMemory (trans);
  4068. trans = nextTrans;
  4069. }
  4070. }
  4071. /* free descriptors of elements closed by a start tag */
  4072. for (entry = 0; pHTMLGIMapping[entry].XMLname[0] != EOS; entry++)
  4073. {
  4074. pClose = FirstClosedElem[entry];
  4075. while (pClose != NULL)
  4076. {
  4077. nextClose = pClose->nextClosedElem;
  4078. TtaFreeMemory (pClose);
  4079. pClose = nextClose;
  4080. }
  4081. }
  4082. TtaFreeMemory (FirstClosedElem);
  4083. FirstClosedElem = NULL;
  4084. }
  4085. /*----------------------------------------------------------------------
  4086. GetNextHTMLbuffer returns the next buffer to be parsed and update
  4087. global variables.
  4088. ----------------------------------------------------------------------*/
  4089. void GetNextHTMLbuffer (FILE *infile, ThotBool *endOfFile,
  4090. char **buff, int *lastchar)
  4091. {
  4092. CHARSET charset = TtaGetDocumentCharset (HTMLcontext.doc);
  4093. int res;
  4094. // copy last treated characters
  4095. if (StartOfTagIndx > 0)
  4096. {
  4097. strncpy (PreviousRead, &WorkBuffer[StartOfTagIndx], PREV_READ_CHARS);
  4098. LastCharInPreviousRead = LastCharInWorkBuffer - StartOfTagIndx;
  4099. StartOfTagIndx = 0;
  4100. }
  4101. else
  4102. {
  4103. PreviousRead[0] = EOS;
  4104. LastCharInPreviousRead = 0;
  4105. }
  4106. *buff = NULL;
  4107. // free previous translation buffer
  4108. if (WorkBuffer != FileBuffer)
  4109. {
  4110. TtaFreeMemory (WorkBuffer);
  4111. WorkBuffer = FileBuffer;
  4112. }
  4113. // need to read a new set of characters
  4114. LastCharInWorkBuffer = gzread (infile, &FileBuffer[StartOfRead],
  4115. INPUT_FILE_BUFFER_SIZE - StartOfRead);
  4116. // add previous read characters not managed yet
  4117. LastCharInWorkBuffer += StartOfRead;
  4118. StartOfRead = 0;
  4119. if (LastCharInWorkBuffer <= 0)
  4120. {
  4121. /* error or end of file */
  4122. *endOfFile = TRUE;
  4123. LastCharInWorkBuffer = 0;
  4124. }
  4125. else
  4126. {
  4127. FileBuffer[LastCharInWorkBuffer] = EOS;
  4128. LastCharInWorkBuffer--;
  4129. if (charset == ISO_8859_2 || charset == ISO_8859_3 ||
  4130. charset == ISO_8859_4 || charset == ISO_8859_5 ||
  4131. charset == ISO_8859_6 || charset == ISO_8859_7 ||
  4132. charset == ISO_8859_8 || charset == ISO_8859_9 ||
  4133. charset == ISO_8859_15 || charset == KOI8_R ||
  4134. charset == WINDOWS_1250 || charset == WINDOWS_1251 ||
  4135. charset == WINDOWS_1252 || charset == WINDOWS_1253 ||
  4136. charset == WINDOWS_1254 || charset == WINDOWS_1255 ||
  4137. charset == WINDOWS_1256 || charset == WINDOWS_1257 ||
  4138. charset == ISO_2022_JP || charset == EUC_JP ||
  4139. charset == SHIFT_JIS || charset == GB_2312)
  4140. {
  4141. /* convert the original stream into UTF-8 */
  4142. *buff = (char *)TtaConvertByteToMbsWithCheck ((const unsigned char *)FileBuffer,
  4143. charset, &res);
  4144. HTMLcontext.encoding = UTF_8;
  4145. if (*buff)
  4146. {
  4147. WorkBuffer = *buff;
  4148. if (res > 0 && res < INPUT_FILE_BUFFER_SIZE)
  4149. {
  4150. // keep last characters for the next read
  4151. StartOfRead = INPUT_FILE_BUFFER_SIZE - res;
  4152. strcpy (FileBuffer, &FileBuffer[res]);
  4153. }
  4154. LastCharInWorkBuffer = strlen (*buff) - 1;
  4155. }
  4156. }
  4157. }
  4158. *lastchar = LastCharInWorkBuffer;
  4159. }
  4160. /*----------------------------------------------------------------------
  4161. GetNextChar returns the next character in the imput file or buffer,
  4162. whatever it is.
  4163. ----------------------------------------------------------------------*/
  4164. static char GetNextChar (FILE *infile, char* buffer, int *index,
  4165. ThotBool *endOfFile)
  4166. {
  4167. wchar_t wcharRead = EOS;
  4168. unsigned char charRead;
  4169. unsigned char fallback[8];
  4170. unsigned char *ptr;
  4171. char *buff;
  4172. int res = 0;
  4173. charRead = EOS;
  4174. *endOfFile = FALSE;
  4175. if (buffer)
  4176. {
  4177. /* read from a buffer */
  4178. if (SecondByte[0] != EOS)
  4179. {
  4180. /* return the second UTF-8 byte */
  4181. charRead = SecondByte[0];
  4182. /* shift */
  4183. strncpy ((char *)SecondByte, (char *)&SecondByte[1], 4);
  4184. }
  4185. else
  4186. {
  4187. charRead = buffer[(*index)++];
  4188. if (charRead == EOS)
  4189. *endOfFile = TRUE;
  4190. else
  4191. {
  4192. if (HTMLcontext.encoding != UTF_8)
  4193. {
  4194. /* translate ISO-latin characters into a UTF-8 string */
  4195. ptr = fallback;
  4196. wcharRead = TtaGetWCFromChar (charRead, HTMLcontext.encoding);
  4197. res = TtaWCToMBstring (wcharRead, &ptr);
  4198. /* handle the first character */
  4199. charRead = fallback[0];
  4200. if (res > 1)
  4201. {
  4202. /* store the second UTF-8 byte */
  4203. res--;
  4204. strncpy ((char *)SecondByte, (char *)&fallback[1], res);
  4205. SecondByte[res] = EOS;
  4206. }
  4207. }
  4208. }
  4209. }
  4210. }
  4211. else if (infile == NULL)
  4212. *endOfFile = TRUE;
  4213. else
  4214. {
  4215. /* read from a file */
  4216. if (*index == 0 && SecondByte[0] == EOS)
  4217. {
  4218. if (NotToReadFile)
  4219. NotToReadFile = FALSE;
  4220. else
  4221. {
  4222. // read next characters
  4223. GetNextHTMLbuffer (infile, endOfFile, &buff, &res);
  4224. if (*endOfFile)
  4225. /* error or end of file */
  4226. charRead = EOS;
  4227. }
  4228. }
  4229. if (NotToReadFile)
  4230. {
  4231. charRead = PreviousRead[(*index)++];
  4232. if (*index > LastCharInPreviousRead)
  4233. *index = 0;
  4234. }
  4235. else if (*endOfFile == FALSE)
  4236. {
  4237. if (SecondByte[0] != EOS)
  4238. {
  4239. /* return the second UTF-8 byte */
  4240. charRead = SecondByte[0];
  4241. /* shift */
  4242. strncpy ((char *)SecondByte, (char *)&SecondByte[1], 4);
  4243. }
  4244. else
  4245. {
  4246. charRead = WorkBuffer[(*index)++];
  4247. if (charRead == EOS)
  4248. *endOfFile = TRUE;
  4249. else
  4250. {
  4251. if (HTMLcontext.encoding == UTF_8)
  4252. {
  4253. /* Search for an UTF-8 BOM character (EF BB BF) */
  4254. /* Laurent Carcone 7/11/2002 */
  4255. if (*index == 1 && LastCharInWorkBuffer > 2 &&
  4256. (unsigned char) WorkBuffer[0] == 0xEF &&
  4257. (unsigned char) WorkBuffer[1] == 0xBB &&
  4258. (unsigned char) WorkBuffer[2] == 0xBF &&
  4259. PreviousRead[0] == EOS)
  4260. {
  4261. charRead = WorkBuffer[(*index)++];
  4262. charRead = WorkBuffer[(*index)++];
  4263. charRead = WorkBuffer[(*index)++];
  4264. }
  4265. }
  4266. else
  4267. {
  4268. /* translate the ISO-latin-1 character into a UTF-8 string */
  4269. ptr = fallback;
  4270. fallback[1] = EOS;
  4271. wcharRead = TtaGetWCFromChar (charRead, HTMLcontext.encoding);
  4272. res = TtaWCToMBstring (wcharRead, &ptr);
  4273. /* handle the first character */
  4274. charRead = fallback[0];
  4275. if (res > 1)
  4276. {
  4277. /* store the second UTF-8 byte */
  4278. res--;
  4279. strncpy ((char *)SecondByte, (char *)&fallback[1], res);
  4280. SecondByte[res] = EOS;
  4281. }
  4282. }
  4283. }
  4284. }
  4285. if (*index > LastCharInWorkBuffer)
  4286. *index = 0;
  4287. }
  4288. }
  4289. return charRead;
  4290. }
  4291. /*----------------------------------------------------------------------
  4292. SetElemLineNumber
  4293. assigns the current line number (number of latest line read from the
  4294. input file) to element el.
  4295. ----------------------------------------------------------------------*/
  4296. void SetElemLineNumber (Element el)
  4297. {
  4298. TtaSetElementLineNumber (el, NumberOfLinesRead);
  4299. }
  4300. /*----------------------------------------------------------------------
  4301. GetNextInputChar returns the next non-null character in the input
  4302. file or buffer.
  4303. ----------------------------------------------------------------------*/
  4304. char GetNextInputChar (FILE *infile, int *index, ThotBool *endOfFile)
  4305. {
  4306. char charRead;
  4307. static ThotBool beg_pair;
  4308. charRead = EOS;
  4309. *endOfFile = FALSE;
  4310. if (PreviousBufChar != EOS)
  4311. {
  4312. charRead = PreviousBufChar;
  4313. PreviousBufChar = EOS;
  4314. }
  4315. else
  4316. {
  4317. charRead = GetNextChar (infile, InputText, index, endOfFile);
  4318. if (InputText == NULL)
  4319. NumberOfCharRead++;
  4320. /* skip null characters*/
  4321. while (charRead == EOS && !*endOfFile)
  4322. {
  4323. charRead = GetNextChar (infile, InputText, index, endOfFile);
  4324. if (InputText == NULL)
  4325. NumberOfCharRead++;
  4326. }
  4327. }
  4328. if (*endOfFile == FALSE)
  4329. {
  4330. if ((int) charRead == __CR__)
  4331. /* CR has been read */
  4332. {
  4333. /* Read next character */
  4334. charRead = GetNextChar (infile, InputText, index, endOfFile);
  4335. if ((int) charRead != EOL && (int) charRead != __CR__)
  4336. /* next character is not LF. Store next character and return LF */
  4337. {
  4338. PreviousBufChar = charRead;
  4339. charRead = EOL;
  4340. }
  4341. }
  4342. /* update the counters of characters and lines read */
  4343. if ((int) charRead == EOL || (int) charRead == __CR__)
  4344. {
  4345. if ((int) charRead == __CR__)
  4346. {
  4347. beg_pair = TRUE;
  4348. if (InputText == NULL)
  4349. NumberOfLinesRead++;
  4350. NumberOfCharRead = 0;
  4351. }
  4352. else
  4353. {
  4354. if (!beg_pair)
  4355. {
  4356. if (InputText == NULL)
  4357. NumberOfLinesRead++;
  4358. NumberOfCharRead = 0;
  4359. }
  4360. else
  4361. beg_pair = FALSE;
  4362. }
  4363. }
  4364. else
  4365. beg_pair = FALSE;
  4366. }
  4367. return charRead;
  4368. }
  4369. /*----------------------------------------------------------------------
  4370. HTMLparse
  4371. Parse either the HTML file infile or the text buffer HTMLbuf and
  4372. build the equivalent Thot abstract tree.
  4373. One parameter should be NULL.
  4374. ----------------------------------------------------------------------*/
  4375. static void HTMLparse (FILE * infile, char* HTMLbuf)
  4376. {
  4377. unsigned char charRead;
  4378. PtrTransition trans;
  4379. ThotBool match;
  4380. currentState = 0;
  4381. if (HTMLbuf != NULL || infile != NULL)
  4382. {
  4383. InputText = HTMLbuf;
  4384. EndOfHtmlFile = FALSE;
  4385. }
  4386. charRead = EOS;
  4387. HTMLrootClosed = FALSE;
  4388. /* read the HTML file sequentially */
  4389. do
  4390. {
  4391. /* read one character from the source if the last character */
  4392. /* read has been processed */
  4393. if (charRead == EOS)
  4394. charRead = GetNextInputChar (infile, &CurrentBufChar, &EndOfHtmlFile);
  4395. if (charRead != EOS)
  4396. {
  4397. /* Check the character read */
  4398. /* Consider LF and FF as the end of an input line. */
  4399. /* Replace end of line by space, except in preformatted text. */
  4400. /* Replace HT by space, except in preformatted text. */
  4401. /* Ignore spaces at the beginning and at the end of input lines */
  4402. /* Ignore non printable characters except HT, LF, FF. */
  4403. if ((int) charRead == EOL || (int) charRead == __CR__)
  4404. /* LF = end of input line */
  4405. {
  4406. /* don't replace end of line by space in a doctype declaration */
  4407. if (currentState != 12 && currentState != 15 &&
  4408. currentState != 20 && currentState != 21 &&
  4409. currentState != 24 && currentState != 35)
  4410. {
  4411. /* don't change characters in comments */
  4412. if (currentState != 0)
  4413. /* not within a text element */
  4414. {
  4415. if (currentState == 6 || currentState == 9)
  4416. /* within an attribute value between quotes */
  4417. if (lastAttrEntry != NULL &&
  4418. !strcmp (lastAttrEntry->XMLattribute, "src"))
  4419. /* value of an SRC attribute */
  4420. /* consider new line as an empty char*/
  4421. charRead = EOS;
  4422. if (charRead != EOS)
  4423. {
  4424. /* Replace new line by a space, except if an entity is
  4425. being read */
  4426. if (currentState == 30 &&
  4427. Within (HTML_EL_Preformatted, DocumentSSchema) &&
  4428. !Within (HTML_EL_Option_Menu, DocumentSSchema))
  4429. charRead = EOL; /* new line character */
  4430. else
  4431. charRead = SPACE;
  4432. }
  4433. }
  4434. else if ((Within (HTML_EL_Preformatted, DocumentSSchema) &&
  4435. !Within (HTML_EL_Option_Menu, DocumentSSchema)) ||
  4436. Within (HTML_EL_Text_Area, DocumentSSchema) ||
  4437. Within (HTML_EL_SCRIPT_, DocumentSSchema) ||
  4438. Within (HTML_EL_STYLE_, DocumentSSchema))
  4439. /* new line in a text element */
  4440. {
  4441. /* within preformatted text */
  4442. if (AfterTagPRE)
  4443. /* ignore NL after a <PRE> tag */
  4444. charRead = EOS;
  4445. else
  4446. /* generate a new line character */
  4447. charRead = EOL;
  4448. }
  4449. else
  4450. /* new line in ordinary text */
  4451. {
  4452. /* suppress all spaces preceding the end of line */
  4453. while (LgBuffer > 0 &&
  4454. inputBuffer[LgBuffer - 1] == SPACE)
  4455. LgBuffer--;
  4456. /* new line is equivalent to space */
  4457. charRead = SPACE;
  4458. if (LgBuffer > 0)
  4459. TextToDocument ();
  4460. }
  4461. }
  4462. /* beginning of a new input line */
  4463. EmptyLine = TRUE;
  4464. }
  4465. else
  4466. /* it's not an end of line */
  4467. {
  4468. if ((int) charRead == TAB)
  4469. /* HT = Horizontal tabulation */
  4470. {
  4471. if (currentState != 0)
  4472. /* not in a text element. Replace HT by space */
  4473. charRead = SPACE;
  4474. else
  4475. /* in a text element. Replace HT by space except in */
  4476. /* preformatted text */
  4477. if (!Within (HTML_EL_Preformatted, DocumentSSchema) &&
  4478. !Within (HTML_EL_STYLE_, DocumentSSchema) &&
  4479. !Within (HTML_EL_SCRIPT_, DocumentSSchema))
  4480. charRead = SPACE;
  4481. }
  4482. if (charRead == SPACE)
  4483. /* space character */
  4484. {
  4485. if (currentState == 12 || currentState == 35 ||
  4486. currentState == 20 || currentState == 21 ||
  4487. (currentState == 0 &&
  4488. !Within (HTML_EL_Preformatted, DocumentSSchema) &&
  4489. !Within (HTML_EL_STYLE_, DocumentSSchema) &&
  4490. !Within (HTML_EL_SCRIPT_, DocumentSSchema) &&
  4491. !Within (HTML_EL_Text_Area, DocumentSSchema)))
  4492. {
  4493. if (EmptyLine)
  4494. /* ignore spaces at the beginning of an input line */
  4495. charRead = EOS;
  4496. else if (LgBuffer > 0 && inputBuffer[LgBuffer-1] == SPACE)
  4497. /* ignore multiple spaces */
  4498. charRead = EOS;
  4499. }
  4500. }
  4501. else
  4502. /* it's a printable character. Keep it as it is and */
  4503. /* stop ignoring spaces */
  4504. {
  4505. EmptyLine = FALSE;
  4506. StartOfFile = FALSE;
  4507. }
  4508. }
  4509. AfterTagPRE = FALSE;
  4510. if (charRead != EOS)
  4511. /* a valid character has been read */
  4512. {
  4513. /* first transition of the automaton for the current state */
  4514. trans = automaton[currentState].firstTransition;
  4515. /* search a transition triggered by the character read */
  4516. while (trans != NULL && !HTMLrootClosed)
  4517. {
  4518. match = FALSE;
  4519. if (charRead == trans->trigger)
  4520. /* the char is the trigger */
  4521. match = TRUE;
  4522. else if (trans->trigger == EOS)
  4523. /* any char is a trigger */
  4524. match = TRUE;
  4525. else if (trans->trigger == SPACE)
  4526. /* any space is a trigger */
  4527. if ((int) charRead == TAB ||
  4528. (int) charRead == EOL ||
  4529. (int) charRead == 12)
  4530. /* a delimiter has been read */
  4531. match = TRUE;
  4532. if (match)
  4533. /* transition found. Activate the transition */
  4534. {
  4535. NormalTransition = TRUE;
  4536. /* call the procedure associated with the transition */
  4537. CharProcessed = FALSE;
  4538. if (trans->action != NULL)
  4539. (*((Proc1)trans->action)) ((void *)(int)charRead);
  4540. if (NormalTransition || CharProcessed)
  4541. /* the input character has been processed */
  4542. charRead = EOS;
  4543. if (NormalTransition)
  4544. {
  4545. /* the procedure associated with the transition has not */
  4546. /* changed state explicitely */
  4547. /* change current automaton state */
  4548. if (trans->newState >= 0)
  4549. currentState = trans->newState;
  4550. else if (trans->newState == -1)
  4551. /* return form subautomaton */
  4552. currentState = returnState;
  4553. else
  4554. /* calling a subautomaton */
  4555. {
  4556. returnState = currentState;
  4557. currentState = -trans->newState;
  4558. }
  4559. }
  4560. /* done */
  4561. trans = NULL;
  4562. }
  4563. else
  4564. /* access next transition from the same state */
  4565. {
  4566. trans = trans->nextTransition;
  4567. /* an exception: when reading the value of an HREF attribute,
  4568. SGML entities (&xxx;) should not be interpreted */
  4569. if (trans == NULL)
  4570. charRead = EOS;
  4571. }
  4572. }
  4573. }
  4574. }
  4575. }
  4576. while (!EndOfHtmlFile && !HTMLrootClosed);
  4577. /* end of HTML file */
  4578. if (!HTMLrootClosed)
  4579. EndOfDocument ();
  4580. HTMLrootClosingTag = NULL;
  4581. HTMLrootClosed = FALSE;
  4582. }
  4583. /*----------------------------------------------------------------------
  4584. GetANewText generates a new text element within a line.
  4585. ----------------------------------------------------------------------*/
  4586. static Element GetANewText (Element el, ElementType elType, Document doc)
  4587. {
  4588. Element elLeaf;
  4589. if (LgBuffer)
  4590. {
  4591. inputBuffer[LgBuffer] = EOS;
  4592. TtaAppendTextContent (el, (unsigned char *)inputBuffer, doc);
  4593. LgBuffer = 0;
  4594. /* Create a new text leaf */
  4595. elType.ElTypeNum = TextFile_EL_TEXT_UNIT;
  4596. elLeaf = TtaNewElement (doc, elType);
  4597. TtaSetElementLineNumber (elLeaf, NumberOfLinesRead);
  4598. TtaInsertSibling (elLeaf, el, FALSE, doc);
  4599. el = elLeaf;
  4600. HTMLcontext.lastElement = el;
  4601. }
  4602. return el;
  4603. }
  4604. /*----------------------------------------------------------------------
  4605. ReadTextFile
  4606. read plain text file into a TextFile document.
  4607. input text comes from either the infile file or the text
  4608. buffer textbuf. One parameter should be NULL.
  4609. ----------------------------------------------------------------------*/
  4610. static void ReadTextFile (FILE *infile, char *textbuf, Document doc,
  4611. const char *pathURL)
  4612. {
  4613. Element parent, el, prev;
  4614. Element elLeaf;
  4615. ElementType elType;
  4616. AttributeType attrType;
  4617. Attribute attr;
  4618. unsigned char charRead;
  4619. int val;
  4620. ThotBool endOfTextFile, color_source;
  4621. ThotBool withinMarkup = FALSE;
  4622. ThotBool withinQuote = FALSE, withinString = FALSE;
  4623. ThotBool withinComment = FALSE;
  4624. InputText = textbuf;
  4625. LgBuffer = 0;
  4626. endOfTextFile = FALSE;
  4627. NumberOfCharRead = 0;
  4628. NumberOfLinesRead = 1;
  4629. CurrentBufChar = 0;
  4630. TtaGetEnvBoolean ("COLOR_SOURCE", &color_source);
  4631. #ifdef ANNOTATIONS
  4632. if (DocumentTypes[doc] == docAnnot)
  4633. /* we search the start of HTML document in the annotation struct */
  4634. parent = ANNOT_GetHTMLRoot (doc, TRUE);
  4635. else
  4636. #endif /* ANNOTATIONS */
  4637. parent = TtaGetRootElement (doc); /* the root element */
  4638. elType = TtaGetElementType (parent);
  4639. el = TtaGetFirstChild (parent); /* first child of the root element */
  4640. if (el == NULL)
  4641. {
  4642. /* insert the Document_URL element */
  4643. elType.ElTypeNum = TextFile_EL_Document_URL;
  4644. prev = TtaNewTree (doc, elType, "");
  4645. TtaInsertFirstChild (&prev, parent, doc);
  4646. /* prevent the user from editing this element */
  4647. TtaSetAccessRight (prev, ReadOnly, doc);
  4648. if (pathURL != NULL && prev != NULL)
  4649. {
  4650. el = TtaGetFirstChild (prev);
  4651. TtaSetTextContent (el, (unsigned char *)pathURL,
  4652. HTMLcontext.language, doc);
  4653. }
  4654. /* insert the BODY element */
  4655. elType.ElTypeNum = TextFile_EL_BODY;
  4656. el = TtaNewElement (doc, elType);
  4657. TtaSetElementLineNumber (el, NumberOfLinesRead);
  4658. if (prev != NULL)
  4659. TtaInsertSibling (el, prev, FALSE, doc);
  4660. else
  4661. TtaInsertFirstChild (&el, parent, doc);
  4662. parent = el;
  4663. }
  4664. prev = el = NULL;
  4665. /* initialize the context */
  4666. HTMLcontext.encoding = TtaGetDocumentCharset (doc);
  4667. HTMLcontext.lastElement = NULL;
  4668. HTMLcontext.lastElementClosed = False;
  4669. HTMLcontext.doc = doc;
  4670. HTMLcontext.mergeText = FALSE;
  4671. HTMLcontext.language = TtaGetDefaultLanguage ();
  4672. attrType.AttrSSchema = TtaGetSSchema ("TextFile", doc);
  4673. /* initialize input buffer */
  4674. charRead = GetNextInputChar (infile, &CurrentBufChar, &endOfTextFile);
  4675. /* read the text file sequentially */
  4676. while (!endOfTextFile)
  4677. {
  4678. if (el == NULL)
  4679. {
  4680. /* create a new line */
  4681. elType.ElTypeNum = TextFile_EL_Line_;
  4682. el = TtaNewTree (doc, elType, "");
  4683. TtaSetElementLineNumber (el, NumberOfLinesRead);
  4684. if (prev != NULL)
  4685. /* new line after the previous */
  4686. TtaInsertSibling (el, prev, FALSE, doc);
  4687. else
  4688. /* first line */
  4689. TtaInsertFirstChild (&el, parent, doc);
  4690. prev = el;
  4691. /* get the text element */
  4692. el = TtaGetFirstChild (el);
  4693. TtaSetElementLineNumber (el, NumberOfLinesRead);
  4694. HTMLcontext.lastElement = el;
  4695. }
  4696. else if (HTMLcontext.lastElement && HTMLcontext.lastElement != el)
  4697. {
  4698. /* one or more symbols were inserted */
  4699. elType.ElTypeNum = TextFile_EL_TEXT_UNIT;
  4700. el = TtaNewElement (doc, elType);
  4701. TtaSetElementLineNumber (el, NumberOfLinesRead);
  4702. TtaInsertSibling (el, HTMLcontext.lastElement, FALSE, doc);
  4703. HTMLcontext.lastElement = el;
  4704. }
  4705. /* Check the character read */
  4706. /* Consider LF and FF as the end of an input line. */
  4707. if ((int) charRead == EOL || (int) charRead == 0)
  4708. {
  4709. /* LF = end of line */
  4710. inputBuffer[LgBuffer] = EOS;
  4711. if (LgBuffer > 0)
  4712. {
  4713. TtaAppendTextContent (el, (unsigned char *)inputBuffer, doc);
  4714. LgBuffer = 0;
  4715. attrType.AttrTypeNum = 0;
  4716. if (withinMarkup)
  4717. {
  4718. /* attach the markup attribute */
  4719. attrType.AttrTypeNum = TextFile_ATTR_IsMarkup;
  4720. val = TextFile_ATTR_IsMarkup_VAL_Yes_;
  4721. }
  4722. else if (withinComment)
  4723. {
  4724. /* attach the markup attribute */
  4725. attrType.AttrTypeNum = TextFile_ATTR_IsComment;
  4726. val = TextFile_ATTR_IsComment_VAL_Yes_;
  4727. }
  4728. else if (withinString)
  4729. {
  4730. /* attach the markup attribute */
  4731. attrType.AttrTypeNum = TextFile_ATTR_IsString;
  4732. val = TextFile_ATTR_IsString_VAL_Yes_;
  4733. }
  4734. if (withinMarkup || withinComment || withinString)
  4735. {
  4736. attr = TtaGetAttribute (el, attrType);
  4737. if (attr == NULL)
  4738. {
  4739. attr = TtaNewAttribute (attrType);
  4740. TtaAttachAttribute (el, attr, doc);
  4741. TtaSetAttributeValue (attr, val, el, doc);
  4742. }
  4743. }
  4744. }
  4745. el = NULL; /* generate a new line */
  4746. charRead = EOS;
  4747. }
  4748. if (charRead != EOS)
  4749. {
  4750. /* a valid character has been read */
  4751. if (!color_source)
  4752. {
  4753. if (LgBuffer + 1 >= AllmostFullBuffer)
  4754. {
  4755. /* store the current buffer contents and continue */
  4756. inputBuffer[LgBuffer] = EOS;
  4757. TtaAppendTextContent (el, (unsigned char *)inputBuffer, doc);
  4758. LgBuffer = 0;
  4759. inputBuffer[LgBuffer++] = charRead;
  4760. }
  4761. else
  4762. inputBuffer[LgBuffer++] = charRead;
  4763. }
  4764. else if (charRead == '@' && DocumentTypes[doc] == docLog && LgBuffer == 0)
  4765. {
  4766. attrType.AttrTypeNum = TextFile_ATTR_IsLink;
  4767. val = TextFile_ATTR_IsLink_VAL_Yes_;
  4768. attr = TtaNewAttribute (attrType);
  4769. TtaAttachAttribute (el, attr, doc);
  4770. TtaSetAttributeValue (attr, val, el, doc);
  4771. }
  4772. else if (withinMarkup &&
  4773. DocumentTypes[doc] != docCSS &&
  4774. DocumentTypes[doc] != docLog &&
  4775. DocumentTypes[doc] != docText &&
  4776. charRead == '-' &&
  4777. !withinString &&
  4778. LgBuffer > 2 &&
  4779. inputBuffer[LgBuffer-1] == '-' &&
  4780. inputBuffer[LgBuffer-2] == '!' &&
  4781. inputBuffer[LgBuffer-3] == '<')
  4782. {
  4783. /* Start a XML comment */
  4784. withinMarkup = FALSE;
  4785. withinComment = TRUE;
  4786. /* add the current character */
  4787. inputBuffer[LgBuffer++] = charRead;
  4788. }
  4789. else if (withinComment &&
  4790. DocumentTypes[doc] != docCSS &&
  4791. DocumentTypes[doc] != docLog &&
  4792. DocumentTypes[doc] != docText &&
  4793. charRead == '>' &&
  4794. !withinString &&
  4795. LgBuffer > 1 &&
  4796. inputBuffer[LgBuffer-1] == '-' &&
  4797. inputBuffer[LgBuffer-2] == '-')
  4798. {
  4799. /* End a XML comment */
  4800. withinComment = FALSE;
  4801. /* add the current character */
  4802. inputBuffer[LgBuffer++] = charRead;
  4803. /* attach the markup attribute */
  4804. attrType.AttrTypeNum = TextFile_ATTR_IsComment;
  4805. attr = TtaGetAttribute (el, attrType);
  4806. if (attr == NULL)
  4807. {
  4808. attr = TtaNewAttribute (attrType);
  4809. val = TextFile_ATTR_IsComment_VAL_Yes_;
  4810. TtaAttachAttribute (el, attr, doc);
  4811. TtaSetAttributeValue (attr, val, el, doc);
  4812. }
  4813. /* generate a new IsString element */
  4814. el = GetANewText (el, elType, doc);
  4815. }
  4816. else if (!withinQuote && !withinComment &&
  4817. DocumentTypes[doc] != docCSS &&
  4818. DocumentTypes[doc] != docLog &&
  4819. DocumentTypes[doc] != docText &&
  4820. ((charRead == '"' && withinString) ||
  4821. (charRead == '"' && !withinString &&
  4822. LgBuffer > 0 && inputBuffer[LgBuffer-1] == '=') ||
  4823. (LgBuffer == 0 && withinString)))
  4824. {
  4825. /* Start/end a string */
  4826. if (charRead == '"')
  4827. withinString = !withinString;
  4828. if (withinString)
  4829. {
  4830. if (withinMarkup)
  4831. {
  4832. /* attach the markup attribute */
  4833. attrType.AttrTypeNum = TextFile_ATTR_IsMarkup;
  4834. attr = TtaGetAttribute (el, attrType);
  4835. if (attr == NULL)
  4836. {
  4837. attr = TtaNewAttribute (attrType);
  4838. val = TextFile_ATTR_IsMarkup_VAL_Yes_;
  4839. TtaAttachAttribute (el, attr, doc);
  4840. TtaSetAttributeValue (attr, val, el, doc);
  4841. }
  4842. }
  4843. /* generate a new IsString element */
  4844. el = GetANewText (el, elType, doc);
  4845. attrType.AttrTypeNum = TextFile_ATTR_IsString;
  4846. attr = TtaGetAttribute (el, attrType);
  4847. if (attr == NULL)
  4848. {
  4849. attr = TtaNewAttribute (attrType);
  4850. val = TextFile_ATTR_IsString_VAL_Yes_;
  4851. TtaAttachAttribute (el, attr, doc);
  4852. TtaSetAttributeValue (attr, val, el, doc);
  4853. }
  4854. /* add the current character */
  4855. inputBuffer[LgBuffer++] = charRead;
  4856. }
  4857. else
  4858. {
  4859. /* add the current character */
  4860. inputBuffer[LgBuffer++] = charRead;
  4861. /* close the IsString element */
  4862. el = GetANewText (el, elType, doc);
  4863. }
  4864. }
  4865. else if (!withinString && !withinComment &&
  4866. DocumentTypes[doc] != docCSS &&
  4867. DocumentTypes[doc] != docLog &&
  4868. DocumentTypes[doc] != docText &&
  4869. ((charRead == '\'' && withinQuote) ||
  4870. (charRead == '\'' && !withinQuote &&
  4871. LgBuffer > 0 && inputBuffer[LgBuffer-1] == '=') ||
  4872. (LgBuffer == 0 && withinQuote)))
  4873. {
  4874. if (charRead == '\'')
  4875. withinQuote = !withinQuote;
  4876. if (withinQuote)
  4877. {
  4878. /* generate a new IsString element */
  4879. el = GetANewText (el, elType, doc);
  4880. attrType.AttrTypeNum = TextFile_ATTR_IsString;
  4881. attr = TtaGetAttribute (el, attrType);
  4882. if (attr == NULL)
  4883. {
  4884. attr = TtaNewAttribute (attrType);
  4885. val = TextFile_ATTR_IsString_VAL_Yes_;
  4886. TtaAttachAttribute (el, attr, doc);
  4887. TtaSetAttributeValue (attr, val, el, doc);
  4888. }
  4889. /* add the current character */
  4890. inputBuffer[LgBuffer++] = charRead;
  4891. }
  4892. else
  4893. {
  4894. /* add the current character */
  4895. inputBuffer[LgBuffer++] = charRead;
  4896. /* close the IsString element */
  4897. el = GetANewText (el, elType, doc);
  4898. }
  4899. }
  4900. else if (!withinString && !withinQuote &&
  4901. !withinComment &&
  4902. DocumentTypes[doc] != docCSS &&
  4903. DocumentTypes[doc] != docLog &&
  4904. DocumentTypes[doc] != docText &&
  4905. (charRead == '<' ||
  4906. (charRead == '>' && withinMarkup)))
  4907. {
  4908. if (charRead == '<')
  4909. {
  4910. /* Start a markup */
  4911. withinMarkup = TRUE;
  4912. /* close the previous element */
  4913. el = GetANewText (el, elType, doc);
  4914. /* add the current character */
  4915. inputBuffer[LgBuffer++] = charRead;
  4916. }
  4917. else
  4918. {
  4919. /* End a markup */
  4920. withinMarkup = FALSE;
  4921. /* add the current character */
  4922. inputBuffer[LgBuffer++] = charRead;
  4923. attrType.AttrTypeNum = TextFile_ATTR_IsMarkup;
  4924. attr = TtaGetAttribute (el, attrType);
  4925. if (attr == NULL)
  4926. {
  4927. attr = TtaNewAttribute (attrType);
  4928. val = TextFile_ATTR_IsMarkup_VAL_Yes_;
  4929. TtaAttachAttribute (el, attr, doc);
  4930. TtaSetAttributeValue (attr, val, el, doc);
  4931. }
  4932. el = GetANewText (el, elType, doc);
  4933. }
  4934. }
  4935. else if ((DocumentTypes[doc] == docCSS ||
  4936. DocumentTypes[doc] == docText) &&
  4937. (charRead == '*' || charRead == '/' ||
  4938. (LgBuffer == 0 && withinComment)))
  4939. {
  4940. if (!withinComment && charRead == '*' &&
  4941. LgBuffer > 0 && inputBuffer[LgBuffer-1] == '/')
  4942. {
  4943. /* open a comment */
  4944. withinComment = !withinComment;
  4945. /* close previous element */
  4946. inputBuffer[0] = EOS;
  4947. el = GetANewText (el, elType, doc);
  4948. /* add the current character */
  4949. inputBuffer[LgBuffer++] = '/';
  4950. }
  4951. else if ((withinComment && charRead == '*') ||
  4952. (!withinComment && charRead == '/'))
  4953. {
  4954. /* flush the current buffer */
  4955. inputBuffer[LgBuffer] = EOS;
  4956. TtaAppendTextContent (el, (unsigned char *)inputBuffer, doc);
  4957. LgBuffer = 0;
  4958. }
  4959. if (withinComment)
  4960. {
  4961. attrType.AttrTypeNum = TextFile_ATTR_IsComment;
  4962. attr = TtaGetAttribute (el, attrType);
  4963. if (attr == NULL)
  4964. {
  4965. attr = TtaNewAttribute (attrType);
  4966. val = TextFile_ATTR_IsComment_VAL_Yes_;
  4967. TtaAttachAttribute (el, attr, doc);
  4968. TtaSetAttributeValue (attr, val, el, doc);
  4969. }
  4970. }
  4971. /* add the current character */
  4972. inputBuffer[LgBuffer++] = charRead;
  4973. if (withinComment && charRead == '/' &&
  4974. LgBuffer == 2 && inputBuffer[0] == '*')
  4975. {
  4976. /* close a comment */
  4977. withinComment = !withinComment;
  4978. el = GetANewText (el, elType, doc);
  4979. }
  4980. }
  4981. else if (LgBuffer + 1 >= AllmostFullBuffer)
  4982. {
  4983. /* store the current buffer contents and continue */
  4984. inputBuffer[LgBuffer] = EOS;
  4985. TtaAppendTextContent (el, (unsigned char *)inputBuffer, doc);
  4986. LgBuffer = 0;
  4987. inputBuffer[LgBuffer++] = charRead;
  4988. }
  4989. else
  4990. inputBuffer[LgBuffer++] = charRead;
  4991. if (el != NULL)
  4992. {
  4993. /* test if last created element is a Symbol */
  4994. elType = TtaGetElementType (el);
  4995. if (elType.ElTypeNum != TextFile_EL_TEXT_UNIT)
  4996. {
  4997. /* Create a new text leaf */
  4998. elType.ElTypeNum = TextFile_EL_TEXT_UNIT;
  4999. elLeaf = TtaNewElement (doc, elType);
  5000. TtaSetElementLineNumber (elLeaf, NumberOfLinesRead);
  5001. TtaInsertSibling (elLeaf, el, FALSE, doc);
  5002. el = elLeaf;
  5003. }
  5004. }
  5005. }
  5006. /* read next character from the source */
  5007. charRead = GetNextInputChar (infile, &CurrentBufChar, &endOfTextFile);
  5008. }
  5009. /* close the document */
  5010. if (LgBuffer != 0)
  5011. {
  5012. inputBuffer[LgBuffer] = EOS;
  5013. TtaAppendTextContent (el, (unsigned char *)inputBuffer, doc);
  5014. }
  5015. if (DocumentTypes[doc] != docSource)
  5016. // clean up the list of css files
  5017. UpdateStyleList (doc, 1);
  5018. }
  5019. /*----------------------------------------------------------------------
  5020. CheckDocHeader parses the loaded file to detect if it includes:
  5021. - an XML declaration (returns xmlDec = TRUE)
  5022. - a doctype (returns docType = TRUE)
  5023. Other returns:
  5024. The indicator isXML
  5025. The document type transitional, XHTML 1.1, basic, other (docProfile)
  5026. The charset value if the XML declaration gives an encoding or
  5027. UNDEFINED_CHARSET.
  5028. The type of the doc'ument (given by the first element name)
  5029. A boolean that indicates if an XML DTD is supported by Amaya
  5030. ----------------------------------------------------------------------*/
  5031. void CheckDocHeader (char *fileName, ThotBool *xmlDec, ThotBool *docType,
  5032. ThotBool *isXML, ThotBool *useMath, ThotBool *isknown,
  5033. int *docProfile, CHARSET *charset, char *charsetname,
  5034. DocumentType *thotType, int *extraProfile)
  5035. {
  5036. gzFile stream;
  5037. char *ptr, *beg, *end, *ptrns, *prefix;
  5038. char *buffer = FileBuffer;
  5039. int res, i, j, k, pref_lg = 0;
  5040. ThotBool endOfSniffedFile, beginning;
  5041. ThotBool found;
  5042. *xmlDec = FALSE;
  5043. *docType = FALSE;
  5044. *isXML = FALSE;
  5045. *isknown = FALSE;
  5046. *useMath = FALSE;
  5047. *docProfile = L_Other;
  5048. *charset = UNDEFINED_CHARSET;
  5049. *thotType = docText;
  5050. *extraProfile = L_NoExtraProfile;
  5051. CurrentNameSpace[0] = EOS;
  5052. stream = TtaGZOpen (fileName);
  5053. if (stream != 0)
  5054. {
  5055. InputText = NULL;
  5056. LgBuffer = 0;
  5057. endOfSniffedFile = FALSE;
  5058. beginning = TRUE;
  5059. while (!endOfSniffedFile)
  5060. {
  5061. res = gzread (stream, buffer, INPUT_FILE_BUFFER_SIZE);
  5062. if (res < 0)
  5063. {
  5064. TtaGZClose (stream);
  5065. return;
  5066. }
  5067. if (res >= 5)
  5068. buffer[res] = EOS;
  5069. /* check if the file contains "<?xml ..." */
  5070. i = 0;
  5071. prefix = NULL;
  5072. endOfSniffedFile = (res < INPUT_FILE_BUFFER_SIZE);
  5073. found = TRUE;
  5074. while (found)
  5075. {
  5076. if (beginning)
  5077. {
  5078. /* looks for the first tag */
  5079. while (i < res &&
  5080. (buffer[i] == SPACE ||
  5081. buffer[i] == EOL ||
  5082. buffer[i] == TAB ||
  5083. buffer[i] == __CR__ ||
  5084. (unsigned char) buffer[i] == 0xEF ||
  5085. (unsigned char) buffer[i] == 0xBB ||
  5086. (unsigned char) buffer[i] == 0xBF ))
  5087. i++;
  5088. if (buffer[i] == '<')
  5089. found = TRUE;
  5090. else
  5091. found = FALSE;
  5092. }
  5093. else
  5094. {
  5095. /* looks for the next tag */
  5096. found = FALSE;
  5097. while (!found && i < res)
  5098. if (buffer[i] == '<')
  5099. found = TRUE;
  5100. else
  5101. i++;
  5102. }
  5103. /* if the declaration is present it's the first element */
  5104. if (found)
  5105. {
  5106. if (beginning && !strncmp (&buffer[i], "<?xml ", 6))
  5107. {
  5108. /* we've found <?xml */
  5109. i += 6;
  5110. *xmlDec = TRUE;
  5111. *isXML = TRUE;
  5112. #ifdef XML_GENERIC
  5113. *thotType = docXml;
  5114. #endif /* XML_GENERIC */
  5115. end = strstr (&buffer[i], "?>");
  5116. /* check whether there is an encoding */
  5117. ptr = strstr (&buffer[i], "encoding");
  5118. if (ptr && ptr < end)
  5119. {
  5120. beg = strstr (ptr, "\"");
  5121. if (beg && beg < end)
  5122. end = strstr (&beg[1], "\"");
  5123. else
  5124. {
  5125. beg = strstr (ptr, "\'");
  5126. if (beg && beg < end)
  5127. end = strstr (&beg[1], "\'");
  5128. }
  5129. if (end && beg && end != beg)
  5130. {
  5131. /* get the document charset */
  5132. k = 0; j = 1;
  5133. while (&beg[j] != end && k < MAX_LENGTH)
  5134. charsetname[k++] = beg[j++];
  5135. charsetname[k] = EOS;
  5136. *charset = TtaGetCharset (charsetname);
  5137. }
  5138. }
  5139. }
  5140. else if (!strncasecmp ((char *)&buffer[i], "<!DOCTYPE", 9))
  5141. {
  5142. /* the doctype is found */
  5143. i += 9;
  5144. *docType = TRUE;
  5145. /* it's not necessary to continue */
  5146. found = FALSE;
  5147. endOfSniffedFile = TRUE;
  5148. end = strstr (&buffer[i], ">");
  5149. /* check the current DOCTYPE */
  5150. ptr = strstr (&buffer[i], "HTML");
  5151. if (!ptr || (ptr && ptr > end))
  5152. ptr = strstr (&buffer[i], "html");
  5153. if (ptr && ptr < end)
  5154. {
  5155. *thotType = docHTML;
  5156. *docProfile = L_Transitional;
  5157. ptr = strstr (&buffer[i], "XHTML");
  5158. if (!ptr || (ptr && ptr > end))
  5159. ptr = strstr (&buffer[i], "xhtml");
  5160. if (ptr && ptr < end)
  5161. {
  5162. /* XHTML has been found */
  5163. /* Does Amaya support this doctype */
  5164. *isXML = TRUE;
  5165. *thotType = docXml;
  5166. ptr = strstr (&buffer[i], "Basic 1.0");
  5167. if (!ptr || (ptr && ptr > end))
  5168. ptr = strstr (&buffer[i], "basic 1.0");
  5169. if (ptr && ptr < end)
  5170. {
  5171. *thotType = docHTML;
  5172. *isknown = TRUE;
  5173. *docProfile = L_Basic;
  5174. }
  5175. else
  5176. {
  5177. ptr = strstr (&buffer[i], "XHTML 1.0");
  5178. if (!ptr || (ptr && ptr > end))
  5179. ptr = strstr (&buffer[i], "xhtml 1.0");
  5180. if (!ptr || (ptr && ptr > end))
  5181. ptr = strstr (&buffer[i], "XHTML 1.1");
  5182. if (!ptr || (ptr && ptr > end))
  5183. ptr = strstr (&buffer[i], "xhtml 1.1");
  5184. if (ptr && ptr < end)
  5185. {
  5186. /* A supported XHTML doctype has been found */
  5187. *thotType = docHTML;
  5188. *isknown = TRUE;
  5189. ptr = strstr (&buffer[i], "Strict");
  5190. if (!ptr || (ptr && ptr > end))
  5191. ptr = strstr (&buffer[i], "strict");
  5192. if (ptr && ptr < end)
  5193. *docProfile = L_Strict;
  5194. else
  5195. {
  5196. ptr = strstr (&buffer[i], "Transitional");
  5197. if (!ptr || (ptr && ptr > end))
  5198. ptr = strstr (&buffer[i], "transitional");
  5199. if (ptr && ptr < end)
  5200. *docProfile = L_Transitional;
  5201. else
  5202. {
  5203. ptr = strstr (&buffer[i], "1.1");
  5204. if (ptr && ptr < end)
  5205. {
  5206. ptr = strstr (&buffer[i], "svg:svg");
  5207. if (ptr && ptr < end)
  5208. {
  5209. *thotType = docSVG;
  5210. *isXML = TRUE;
  5211. *isknown = TRUE;
  5212. *docProfile = L_SVG;
  5213. *useMath = TRUE;
  5214. }
  5215. else
  5216. {
  5217. *docProfile = L_Xhtml11;
  5218. ptr = strstr (&buffer[i], "plus MathML");
  5219. if (ptr && ptr < end)
  5220. *useMath = TRUE;
  5221. }
  5222. }
  5223. }
  5224. }
  5225. }
  5226. else
  5227. {
  5228. ptr = strstr (&buffer[i], "xhtml-math11");
  5229. if (!ptr || (ptr && ptr > end))
  5230. ptr = strstr (&buffer[i], "XHTML-MATH11");
  5231. if (ptr && ptr < end)
  5232. {
  5233. *thotType = docHTML;
  5234. *isknown = TRUE;
  5235. *docProfile = L_Xhtml11;
  5236. }
  5237. else
  5238. {
  5239. ptr = strstr (&buffer[i], "+RDFa");
  5240. if (!ptr || (ptr && ptr > end))
  5241. ptr = strstr (&buffer[i], "+rdfa");
  5242. if (ptr && ptr < end)
  5243. {
  5244. *thotType = docHTML;
  5245. *isknown = TRUE;
  5246. *docProfile = L_Xhtml11;
  5247. *extraProfile = L_RDFa;
  5248. }
  5249. }
  5250. }
  5251. }
  5252. }
  5253. else
  5254. {
  5255. ptr = strstr (&buffer[i], "Strict");
  5256. if (!ptr || (ptr && ptr > end))
  5257. ptr = strstr (&buffer[i], "strict");
  5258. if (ptr && ptr < end)
  5259. *docProfile = L_Strict;
  5260. else
  5261. {
  5262. ptr = strstr (&buffer[i], "Transitional");
  5263. if (!ptr || (ptr && ptr > end))
  5264. ptr = strstr (&buffer[i], "transitional");
  5265. if (ptr && ptr < end)
  5266. *docProfile = L_Transitional;
  5267. }
  5268. }
  5269. }
  5270. else
  5271. {
  5272. /* Look for svg tag */
  5273. ptr = strstr (&buffer[i], "SVG");
  5274. if (!ptr || (ptr && ptr > end))
  5275. ptr = strstr (&buffer[i], "svg");
  5276. if (ptr && ptr < end)
  5277. {
  5278. *thotType = docSVG;
  5279. *isXML = TRUE;
  5280. *isknown = TRUE;
  5281. *docProfile = L_SVG;
  5282. }
  5283. else
  5284. {
  5285. /* Look for math tag */
  5286. ptr = strstr (&buffer[i], "MATH");
  5287. if (!ptr || (ptr && ptr > end))
  5288. ptr = strstr (&buffer[i], "math");
  5289. if (ptr && ptr < end)
  5290. {
  5291. *isXML = TRUE;
  5292. *isknown = TRUE;
  5293. *thotType = docMath;
  5294. *docProfile = L_MathML;
  5295. }
  5296. }
  5297. }
  5298. }
  5299. else if (!strncmp (&buffer[i], "<!", 2) ||
  5300. !strncmp (&buffer[i], "<?", 2))
  5301. {
  5302. /* it's a comment or a PI */
  5303. if (!strncmp (&buffer[i], "<!", 2))
  5304. {
  5305. /* look for the end of the comment */
  5306. found = FALSE;
  5307. while (!found && i < res-2)
  5308. if (!strncmp (&buffer[i], "-->", 3))
  5309. found = TRUE;
  5310. else
  5311. i++;
  5312. }
  5313. else
  5314. {
  5315. /* look for the end of the PI */
  5316. found = FALSE;
  5317. while (!found && i < res-1)
  5318. if (!strncmp (&buffer[i], "?>", 2))
  5319. found = TRUE;
  5320. else
  5321. i++;
  5322. }
  5323. if (!found)
  5324. /* the end of the comment or PI can't be found */
  5325. /* it's not necessary to continue */
  5326. endOfSniffedFile = TRUE;
  5327. }
  5328. else if (buffer[i] == '<')
  5329. {
  5330. /* it's most probably a start tag. Is there a
  5331. namespace prefix? */
  5332. i++;
  5333. j = i;
  5334. while (j < res &&
  5335. (buffer[j] != SPACE &&
  5336. buffer[j] != EOL &&
  5337. buffer[j] != TAB &&
  5338. buffer[j] != __CR__ &&
  5339. buffer[j] != ':'))
  5340. j++;
  5341. if (buffer[j] == ':')
  5342. {
  5343. /* there is a prefix, skip it */
  5344. prefix = &buffer[i];
  5345. pref_lg = j - i;
  5346. i = j + 1;
  5347. }
  5348. if (!strncasecmp ((char *)&buffer[i], "html", 4))
  5349. {
  5350. /* the html tag is found */
  5351. i += 4;
  5352. /* it's not necessary to continue */
  5353. found = FALSE;
  5354. endOfSniffedFile = TRUE;
  5355. end = strstr (&buffer[i], ">");
  5356. ptrns = strstr (&buffer[i], "xmlns");
  5357. if (ptrns)
  5358. while (ptrns && ptrns < end)
  5359. {
  5360. *isXML = TRUE;
  5361. ptrns += 5;
  5362. if (*ptrns != ':' ||
  5363. (prefix && !strncmp (&ptrns[1], (const char*)prefix, pref_lg)))
  5364. {
  5365. ptr = strstr (ptrns, XHTML_URI);
  5366. if (ptr && ptr < end)
  5367. {
  5368. /* The xhtml namespace declaration is found */
  5369. *thotType = docHTML;
  5370. *isknown = TRUE;
  5371. *docProfile = L_Transitional;
  5372. }
  5373. ptrns = NULL;
  5374. }
  5375. else
  5376. ptrns = strstr (ptrns, "xmlns");
  5377. }
  5378. else
  5379. /* No namespace, we consider the document as an html one */
  5380. *thotType = docHTML;
  5381. }
  5382. else if (!strncasecmp ((char *)&buffer[i], "svg", 3))
  5383. {
  5384. /* the svg tag is found */
  5385. i += 3;
  5386. /* it's not necessary to continue */
  5387. found = FALSE;
  5388. endOfSniffedFile = TRUE;
  5389. /* We consider the document as a svg one */
  5390. *thotType = docSVG;
  5391. *docProfile = L_SVG;
  5392. end = strstr (&buffer[i], ">");
  5393. ptrns = strstr (&buffer[i], "xmlns");
  5394. while (ptrns && ptrns < end)
  5395. {
  5396. *isXML = TRUE;
  5397. ptrns += 5;
  5398. if (*ptrns != ':' ||
  5399. (prefix && !strncmp (&ptrns[1], (const char*)prefix, pref_lg)))
  5400. {
  5401. ptr = strstr (ptrns, "svg");
  5402. if (ptr && ptr < end)
  5403. {
  5404. /* The svg namespace declaration is found */
  5405. *isknown = TRUE;
  5406. }
  5407. ptrns = NULL;
  5408. }
  5409. else
  5410. ptrns = strstr (ptrns, "xmlns");
  5411. }
  5412. }
  5413. else if (!strncasecmp ((char *)&buffer[i], "math", 4))
  5414. {
  5415. /* the math tag is found */
  5416. i += 4;
  5417. /* it's not necessary to continue */
  5418. found = FALSE;
  5419. endOfSniffedFile = TRUE;
  5420. /* We consider the document as a mathml one */
  5421. *thotType = docMath;
  5422. *docProfile = L_MathML;
  5423. end = strstr (&buffer[i], ">");
  5424. ptrns = strstr (&buffer[i], "xmlns");
  5425. while (ptrns && ptrns < end)
  5426. {
  5427. *isXML = TRUE;
  5428. ptrns += 5;
  5429. if (*ptrns != ':' ||
  5430. (prefix && !strncmp (&ptrns[1], (const char*)prefix, pref_lg)))
  5431. {
  5432. ptr = strstr (ptrns, "MathML");
  5433. if (ptr && ptr < end)
  5434. /* The MathML namespace declaration is found */
  5435. *isknown = TRUE;
  5436. ptrns = NULL;
  5437. }
  5438. else
  5439. ptrns = strstr (ptrns, "xmlns");
  5440. }
  5441. }
  5442. else if (!strncasecmp ((char *)&buffer[i], "library", 7))
  5443. {
  5444. /* the library tag is found */
  5445. i += 7;
  5446. /* it's not necessary to continue */
  5447. found = FALSE;
  5448. endOfSniffedFile = TRUE;
  5449. /* We consider the document as a mathml one */
  5450. *thotType = docTemplate;
  5451. end = strstr (&buffer[i], ">");
  5452. ptrns = strstr (&buffer[i], "xmlns");
  5453. while (ptrns && ptrns < end)
  5454. {
  5455. *isXML = TRUE;
  5456. ptrns += 5;
  5457. if (*ptrns != ':' ||
  5458. (prefix && !strncmp (&ptrns[1], (const char*)prefix, pref_lg)))
  5459. {
  5460. ptr = strstr (ptrns, "xtiger");
  5461. if (ptr && ptr < end)
  5462. /* The xtiger namespace declaration is found */
  5463. *isknown = TRUE;
  5464. ptrns = NULL;
  5465. }
  5466. else
  5467. ptrns = strstr (ptrns, "xmlns");
  5468. }
  5469. }
  5470. else
  5471. {
  5472. /* it's not necessary to continue */
  5473. found = FALSE;
  5474. endOfSniffedFile = TRUE;
  5475. /* We consider the document as a xml one */
  5476. end = strstr (&buffer[i], ">");
  5477. ptrns = strstr (&buffer[i], "xmlns");
  5478. while (ptrns && ptrns < end)
  5479. {
  5480. *thotType = docXml;
  5481. ptrns += 5;
  5482. if (*ptrns != ':' ||
  5483. (prefix && !strncmp (&ptrns[1], (const char*)prefix, pref_lg)))
  5484. {
  5485. // copy the namespace
  5486. while (ptrns != end && *ptrns != '"')
  5487. ptrns++;
  5488. ptr = &ptrns[1];
  5489. j = 0;
  5490. while (ptr != end && *ptr != *ptrns && j < NAME_LENGTH)
  5491. {
  5492. CurrentNameSpace[j++] = *ptr;
  5493. ptr++;
  5494. }
  5495. ptrns = NULL;
  5496. }
  5497. else
  5498. ptrns = strstr (ptrns, "xmlns");
  5499. }
  5500. if (CurrentNameSpace[0] == EOS)
  5501. {
  5502. // copy the root name
  5503. j = 0;
  5504. ptr = &buffer[i];
  5505. while (ptr != end && *ptr != SPACE && j < NAME_LENGTH)
  5506. {
  5507. CurrentNameSpace[j++] = *ptr;
  5508. ptr++;
  5509. }
  5510. }
  5511. }
  5512. }
  5513. else
  5514. {
  5515. /* it's not a comment nor a PI nor a start tag */
  5516. /* stop sniffing */
  5517. found = FALSE;
  5518. endOfSniffedFile = TRUE;
  5519. }
  5520. }
  5521. else
  5522. /* it's not necessary to continue */
  5523. endOfSniffedFile = TRUE;
  5524. /* we're no longer parsing the beginning of the file */
  5525. beginning = FALSE;
  5526. }
  5527. }
  5528. TtaGZClose (stream);
  5529. }
  5530. // allow RDFa for XHTML documents without doctype
  5531. if ((*docProfile == L_Xhtml11 || *docProfile == L_Transitional) &&
  5532. *isXML && !(*docType))
  5533. *extraProfile = L_RDFa;
  5534. }
  5535. /*----------------------------------------------------------------------
  5536. CheckCharsetInMeta
  5537. Parses the loaded file to detect if it includes a charset value
  5538. in a META element
  5539. ----------------------------------------------------------------------*/
  5540. void CheckCharsetInMeta (char *fileName, CHARSET *charset, char *charsetname)
  5541. {
  5542. gzFile stream;
  5543. char *ptr, *end, *end2, *meta, *endmeta, *content, *body, *http;
  5544. char *buffer = FileBuffer;
  5545. int res, i, j, k;
  5546. ThotBool endOfSniffedFile;
  5547. *charset = UNDEFINED_CHARSET;
  5548. stream = TtaGZOpen (fileName);
  5549. if (stream != 0)
  5550. {
  5551. InputText = NULL;
  5552. LgBuffer = 0;
  5553. endOfSniffedFile = FALSE;
  5554. while (!endOfSniffedFile)
  5555. {
  5556. res = gzread (stream, buffer, INPUT_FILE_BUFFER_SIZE);
  5557. if (res < 0)
  5558. {
  5559. TtaGZClose (stream);
  5560. return;
  5561. }
  5562. if (res >= 5)
  5563. buffer[res] = EOS;
  5564. i = 0;
  5565. endOfSniffedFile = (res < INPUT_FILE_BUFFER_SIZE);
  5566. /* looks for the first <meta> element */
  5567. meta = (char*)StrCaseStr (&buffer[i], "<meta");
  5568. if (meta)
  5569. {
  5570. endmeta = strstr (meta, ">");
  5571. /* looks for the first "http-equiv" declaration */
  5572. http = (char*)StrCaseStr (meta, "http-equiv");
  5573. if (http && http > endmeta)
  5574. {
  5575. while (endmeta && http > endmeta)
  5576. {
  5577. meta = (char*)StrCaseStr (endmeta, "<meta");
  5578. if (meta)
  5579. endmeta = strstr (meta, ">");
  5580. else
  5581. {
  5582. endmeta = NULL;
  5583. http = NULL;
  5584. }
  5585. }
  5586. }
  5587. if (http)
  5588. {
  5589. /* looks for the "Content-Type" declaration */
  5590. content = (char*)StrCaseStr (meta, "content-type");
  5591. if (content)
  5592. {
  5593. /* check whether there is a charset */
  5594. ptr = (char*)StrCaseStr (meta, "charset");
  5595. if (ptr)
  5596. {
  5597. endOfSniffedFile = TRUE;
  5598. end = NULL;
  5599. ptr = strstr (ptr, "=");
  5600. if (ptr)
  5601. {
  5602. end2 = strstr (&ptr[1], ">");
  5603. if (end2)
  5604. {
  5605. end = strstr (&ptr[1], "\"");
  5606. if (!end || (end && end > end2))
  5607. {
  5608. end = strstr (&ptr[1], "\'");
  5609. if (end && end > end2)
  5610. end = NULL;
  5611. }
  5612. }
  5613. }
  5614. if (end && end != ptr)
  5615. {
  5616. /* get the document charset */
  5617. k = 0; j = 1;
  5618. while (&ptr[j] != end && k < MAX_LENGTH)
  5619. charsetname[k++] = ptr[j++];
  5620. charsetname[k] = EOS;
  5621. *charset = TtaGetCharset (charsetname);
  5622. }
  5623. }
  5624. }
  5625. }
  5626. }
  5627. /* looks for the <body> element */
  5628. if (!endOfSniffedFile)
  5629. {
  5630. body = (char*)StrCaseStr (&buffer[i], "<body");
  5631. if (body)
  5632. endOfSniffedFile = TRUE;
  5633. }
  5634. }
  5635. TtaGZClose (stream);
  5636. }
  5637. }
  5638. /*----------------------------------------------------------------------
  5639. ----------------------------------------------------------------------*/
  5640. static void CheckHeadElements (Element el, Element *elHead,
  5641. Element *elBody, Document doc)
  5642. {
  5643. Element nextEl, rootEl, lastChild;
  5644. ElementType elType;
  5645. /* check all children of the given element */
  5646. el = TtaGetFirstChild (el);
  5647. lastChild = NULL;
  5648. while (el != NULL)
  5649. {
  5650. nextEl = el;
  5651. TtaNextSibling (&nextEl);
  5652. elType = TtaGetElementType (el);
  5653. if (elType.ElTypeNum == HTML_EL_BODY &&
  5654. elType.ElSSchema == DocumentSSchema && *elBody == NULL)
  5655. *elBody = el;
  5656. else if ((elType.ElTypeNum == HTML_EL_TITLE ||
  5657. elType.ElTypeNum == HTML_EL_ISINDEX ||
  5658. elType.ElTypeNum == HTML_EL_BASE ||
  5659. elType.ElTypeNum == HTML_EL_STYLE_ ||
  5660. elType.ElTypeNum == HTML_EL_META ||
  5661. elType.ElTypeNum == HTML_EL_LINK) &&
  5662. elType.ElSSchema == DocumentSSchema)
  5663. /* this element should be a child of HEAD */
  5664. {
  5665. /* create the HEAD element if it does not exist */
  5666. if (*elHead == NULL)
  5667. {
  5668. #ifdef ANNOTATIONS
  5669. if (DocumentTypes[doc] == docAnnot)
  5670. rootEl = ANNOT_GetHTMLRoot (doc, TRUE);
  5671. else
  5672. #endif /* ANNOTATIONS */
  5673. rootEl = TtaGetRootElement (doc);
  5674. elType.ElTypeNum = HTML_EL_HEAD;
  5675. *elHead = TtaNewElement (doc, elType);
  5676. TtaInsertFirstChild (elHead, rootEl, doc);
  5677. }
  5678. else
  5679. {
  5680. elType.ElTypeNum = HTML_EL_TITLE;
  5681. lastChild = TtaSearchTypedElement (elType, SearchInTree, *elHead);
  5682. if (!lastChild)
  5683. lastChild = TtaGetFirstChild (*elHead);
  5684. }
  5685. /* move the element as the last child of the HEAD element */
  5686. TtaRemoveTree (el, doc);
  5687. if (lastChild)
  5688. TtaInsertSibling (el, lastChild, FALSE, doc);
  5689. else
  5690. TtaInsertFirstChild (&el, *elHead, doc);
  5691. lastChild = el;
  5692. }
  5693. /* get next child of the root */
  5694. el = nextEl;
  5695. }
  5696. /* is there a TITLE element in the HEAD ? */
  5697. if (*elHead != NULL)
  5698. {
  5699. elType = TtaGetElementType (*elHead);
  5700. elType.ElTypeNum = HTML_EL_TITLE;
  5701. if (!TtaSearchTypedElement (elType, SearchInTree, *elHead))
  5702. {
  5703. /* create the title */
  5704. lastChild = TtaNewTree (doc, elType, "");
  5705. TtaInsertFirstChild (&lastChild, *elHead, doc);
  5706. }
  5707. }
  5708. }
  5709. /*----------------------------------------------------------------------
  5710. EncloseCharLevelElem
  5711. create a copy of element charEl for all descendants of el which are not
  5712. block level elements.
  5713. ----------------------------------------------------------------------*/
  5714. static void EncloseCharLevelElem (Element el, Element charEl,
  5715. Document doc, ThotBool *done)
  5716. {
  5717. Element child, next, copy, prev, elem;
  5718. ElementType elType;
  5719. if (IsEmptyElement (el))
  5720. return;
  5721. elType = TtaGetElementType (el);
  5722. if (elType.ElTypeNum == HTML_EL_Table_head ||
  5723. elType.ElTypeNum == HTML_EL_C_Head ||
  5724. elType.ElTypeNum == HTML_EL_Horizontal_Rule)
  5725. /* cannot insert any element into a Table_head or Horizontal_Rule */
  5726. return;
  5727. child = TtaGetFirstChild (el);
  5728. if (child == NULL)
  5729. {
  5730. copy = TtaCopyTree (charEl, doc, doc, el);
  5731. TtaInsertFirstChild (&copy, el, doc);
  5732. }
  5733. else
  5734. {
  5735. prev = NULL;
  5736. do
  5737. {
  5738. next = child;
  5739. TtaNextSibling (&next);
  5740. elem = child;
  5741. if (!IsCharacterLevelElement (elem))
  5742. /* create copies of element parent for all descendants of elem */
  5743. {
  5744. EncloseCharLevelElem (elem, charEl, doc, done);
  5745. prev = NULL;
  5746. if (*done)
  5747. next = NULL;
  5748. }
  5749. else
  5750. /* enclose elem in a copy of charEl */
  5751. {
  5752. if (prev != NULL)
  5753. {
  5754. TtaRemoveTree (elem, doc);
  5755. TtaInsertSibling (elem, prev, FALSE, doc);
  5756. }
  5757. else
  5758. {
  5759. elType = TtaGetElementType (charEl);
  5760. copy = TtaCopyTree (charEl, doc, doc, el);
  5761. TtaInsertSibling (copy, elem, TRUE, doc);
  5762. TtaRemoveTree (elem, doc);
  5763. TtaInsertFirstChild (&elem, copy, doc);
  5764. /* do it only once for an Anchor */
  5765. if (elType.ElTypeNum == HTML_EL_Anchor)
  5766. *done = TRUE;
  5767. }
  5768. prev = elem;
  5769. }
  5770. child = next;
  5771. }
  5772. while (child != NULL);
  5773. }
  5774. }
  5775. /*----------------------------------------------------------------------
  5776. MergeElements
  5777. merge element old into element element el.
  5778. ----------------------------------------------------------------------*/
  5779. static void MergeElements (Element old, Element el, Document doc)
  5780. {
  5781. Element elem, next, prev, sibling;
  5782. elem = TtaGetFirstChild (old);
  5783. sibling = TtaGetFirstChild (el);
  5784. prev = NULL;
  5785. while (elem != NULL)
  5786. {
  5787. next = elem;
  5788. TtaNextSibling (&next);
  5789. TtaRemoveTree (elem, doc);
  5790. if (prev != NULL)
  5791. TtaInsertSibling (elem, prev, FALSE, doc);
  5792. else
  5793. if (sibling == NULL)
  5794. TtaInsertFirstChild (&elem, el, doc);
  5795. else
  5796. TtaInsertSibling (elem, sibling, TRUE, doc);
  5797. prev = elem;
  5798. elem = next;
  5799. }
  5800. TtaDeleteTree (old, doc);
  5801. }
  5802. /*----------------------------------------------------------------------
  5803. MergePseudoParagraph
  5804. if element el is a pseudo-paragraph and its neighbours elements are also
  5805. pseudo paragraphs, merge these elements into a single pseudo-paragraph.
  5806. ----------------------------------------------------------------------*/
  5807. static void MergePseudoParagraph (Element el, Document doc)
  5808. {
  5809. Element prev, next;
  5810. ElementType elType;
  5811. elType = TtaGetElementType (el);
  5812. if (elType.ElTypeNum == HTML_EL_Pseudo_paragraph)
  5813. {
  5814. prev = el;
  5815. TtaPreviousSibling (&prev);
  5816. if (prev != NULL)
  5817. {
  5818. elType = TtaGetElementType (prev);
  5819. if (elType.ElTypeNum == HTML_EL_Pseudo_paragraph)
  5820. /* previous sibling is a pseud-paragraph too */
  5821. MergeElements (prev, el, doc);
  5822. }
  5823. next = el;
  5824. TtaNextSibling (&next);
  5825. if (next != NULL)
  5826. {
  5827. elType = TtaGetElementType (next);
  5828. if (elType.ElTypeNum == HTML_EL_Pseudo_paragraph)
  5829. /* next sibling is a pseud-paragraph too */
  5830. MergeElements (el, next, doc);
  5831. }
  5832. }
  5833. }
  5834. /*----------------------------------------------------------------------
  5835. CheckBlocksInCharElem
  5836. handle character-level elements which contain block-level elements
  5837. ----------------------------------------------------------------------*/
  5838. void CheckBlocksInCharElem (Document doc)
  5839. {
  5840. Element el, parent, child, first, last, next, copy;
  5841. Element newparent, elem, prev, firstNotCharElem;
  5842. PtrElemToBeChecked elTBC, nextElTBC, TBC;
  5843. ElementType elType, parentType;
  5844. ThotBool done;
  5845. /* check all block-level elements whose parent
  5846. was a character-level element */
  5847. elTBC = FirstElemToBeChecked;
  5848. while (elTBC)
  5849. {
  5850. el = elTBC->Elem;
  5851. while (el != NULL)
  5852. {
  5853. parent = TtaGetParent (el);
  5854. if (parent == NULL)
  5855. el = NULL;
  5856. else if (!IsCharacterLevelElement (parent))
  5857. {
  5858. MergePseudoParagraph (el, doc);
  5859. el = NULL;
  5860. }
  5861. else
  5862. {
  5863. parentType = TtaGetElementType (parent);
  5864. elType = TtaGetElementType (el);
  5865. firstNotCharElem = NULL;
  5866. if (elType.ElTypeNum == HTML_EL_Pseudo_paragraph)
  5867. {
  5868. first = TtaGetFirstChild (el);
  5869. child = first;
  5870. last = NULL;
  5871. /* move the pseudo paragraph as sibling of parent*/
  5872. TtaRemoveTree (el, doc);
  5873. TtaInsertSibling (el, parent, TRUE, doc);
  5874. /* move all children of element el as children of parent */
  5875. do
  5876. {
  5877. next = child;
  5878. TtaNextSibling (&next);
  5879. /* register the next element to be checked */
  5880. if (firstNotCharElem == NULL &&
  5881. !IsCharacterLevelElement (child))
  5882. firstNotCharElem = child;
  5883. TtaRemoveTree (child, doc);
  5884. if (child == first)
  5885. TtaInsertFirstChild (&child, parent, doc);
  5886. else
  5887. /* Modif LC 21/06/01 */
  5888. /* Insert the element 'child' after the element
  5889. 'last', not before */
  5890. TtaInsertSibling (child, last, FALSE, doc);
  5891. last = child;
  5892. child = next;
  5893. }
  5894. while (child != NULL);
  5895. elType.ElTypeNum = HTML_EL_Pseudo_paragraph;
  5896. if (TtaGetTypedAncestor (parent, elType))
  5897. {
  5898. /* there is already an ancestor of type pseudo paragraph.
  5899. Delete this one */
  5900. TtaDeleteTree (el, doc);
  5901. el = NULL;
  5902. }
  5903. else
  5904. /* move parent as a child of the pseudo paragraph */
  5905. {
  5906. TtaRemoveTree (parent, doc);
  5907. TtaInsertFirstChild (&parent, el, doc);
  5908. }
  5909. }
  5910. else
  5911. {
  5912. /* move all children of element parent as siblings of el */
  5913. first = TtaGetFirstChild (parent);
  5914. child = first;
  5915. do
  5916. {
  5917. next = child;
  5918. TtaNextSibling (&next);
  5919. TtaRemoveTree (child, doc);
  5920. TtaInsertSibling (child, parent, TRUE, doc);
  5921. last = child;
  5922. child = next;
  5923. }
  5924. while (child != NULL);
  5925. /* copy the character-level element for all elements that
  5926. have been moved */
  5927. newparent = TtaGetParent (parent);
  5928. elem = first;
  5929. prev = NULL;
  5930. do
  5931. {
  5932. /* if the character level element (parent) is an anchor, don't
  5933. repeat it several times */
  5934. if (elem == last || parentType.ElTypeNum == HTML_EL_Anchor)
  5935. next = NULL;
  5936. else
  5937. {
  5938. next = elem;
  5939. TtaNextSibling (&next);
  5940. }
  5941. elType = TtaGetElementType (elem);
  5942. if (elType.ElTypeNum != HTML_EL_Comment_ &&
  5943. !IsCharacterLevelElement (elem))
  5944. /* This is not a character level element */
  5945. /* create a copy of parent for all decendants of child */
  5946. {
  5947. done = FALSE;
  5948. EncloseCharLevelElem (elem, parent, doc, &done);
  5949. if (done)
  5950. next = NULL;
  5951. prev = NULL;
  5952. /* register the next element to be checked */
  5953. if (firstNotCharElem == NULL)
  5954. firstNotCharElem = elem;
  5955. }
  5956. else
  5957. /* this is a character level element */
  5958. /* enclose elem in a copy of parent element */
  5959. {
  5960. if (prev != NULL)
  5961. {
  5962. TtaRemoveTree (elem, doc);
  5963. TtaInsertSibling (elem, prev, FALSE, doc);
  5964. }
  5965. else
  5966. {
  5967. copy = TtaCopyTree (parent, doc, doc, newparent);
  5968. TtaInsertSibling (copy, elem, TRUE, doc);
  5969. TtaRemoveTree (elem, doc);
  5970. TtaInsertFirstChild (&elem, copy, doc);
  5971. }
  5972. prev = elem;
  5973. }
  5974. elem = next;
  5975. }
  5976. while (elem != NULL);
  5977. /* delete the old character-level element */
  5978. TtaDeleteTree (parent, doc);
  5979. }
  5980. /* if, among the elements that have just been moved, there are
  5981. pseudo-paragraphs which are now children of a block element,
  5982. remove these pseudo-paragraphs */
  5983. elem = firstNotCharElem;
  5984. if (firstNotCharElem)
  5985. {
  5986. parent = TtaGetParent (firstNotCharElem);
  5987. if (parent != NULL && !IsBlockElement (parent))
  5988. elem = NULL;
  5989. }
  5990. while (elem != NULL)
  5991. {
  5992. if (elem == last)
  5993. next = NULL;
  5994. else
  5995. {
  5996. next = elem;
  5997. TtaNextSibling (&next);
  5998. }
  5999. elType = TtaGetElementType (elem);
  6000. if (elType.ElTypeNum == HTML_EL_Pseudo_paragraph)
  6001. {
  6002. child = TtaGetFirstChild (elem);
  6003. do
  6004. {
  6005. next = child;
  6006. TtaNextSibling (&next);
  6007. TtaRemoveTree (child, doc);
  6008. TtaInsertSibling (child, elem, TRUE, doc);
  6009. child = next;
  6010. }
  6011. while (child != NULL);
  6012. if (elem == el)
  6013. el = NULL;
  6014. /* if this element is in the queue, remove it from the queue */
  6015. TBC = elTBC->nextElemToBeChecked;
  6016. while (TBC != NULL)
  6017. {
  6018. if (TBC->Elem == elem)
  6019. TBC->Elem = NULL;
  6020. TBC = TBC->nextElemToBeChecked;
  6021. }
  6022. TtaDeleteTree (elem, doc);
  6023. }
  6024. elem = next;
  6025. }
  6026. /* if el is a Pseudo-paragraph, merge it with its next or previous
  6027. siblings if they also are Pseudo-paragraphs */
  6028. if (el != NULL)
  6029. MergePseudoParagraph (el, doc);
  6030. }
  6031. }
  6032. nextElTBC = elTBC->nextElemToBeChecked;
  6033. TtaFreeMemory (elTBC);
  6034. elTBC = nextElTBC;
  6035. }
  6036. FirstElemToBeChecked = NULL;
  6037. LastElemToBeChecked = NULL;
  6038. }
  6039. /*----------------------------------------------------------------------
  6040. ParentOfType
  6041. Return the parent element of element el if it is an HTML element of type
  6042. typeNum. Ignore elements from the Templates namespace.
  6043. ----------------------------------------------------------------------*/
  6044. static Element ParentOfType (Element el, int typeNum)
  6045. {
  6046. ElementType elType;
  6047. Element parent;
  6048. parent = TtaGetParent (el);
  6049. elType = TtaGetElementType (parent);
  6050. #ifdef TEMPLATES
  6051. /* if the parent is a Template element, skip it */
  6052. while (strcmp(TtaGetSSchemaName(elType.ElSSchema),"Template") == 0)
  6053. {
  6054. parent = TtaGetParent (parent);
  6055. elType = TtaGetElementType (parent);
  6056. }
  6057. #endif /* TEMPLATES */
  6058. if (elType.ElTypeNum != typeNum)
  6059. parent = NULL;
  6060. else if (strcmp(TtaGetSSchemaName(elType.ElSSchema),"HTML") != 0)
  6061. parent = NULL;
  6062. return parent;
  6063. }
  6064. /*----------------------------------------------------------------------
  6065. CheckAbstractTree
  6066. Check the Thot abstract tree and create the missing elements.
  6067. The parameter isXTiger is TRUE when parsing a XTiger template.
  6068. ----------------------------------------------------------------------*/
  6069. void CheckAbstractTree (Document doc, ThotBool isXTiger)
  6070. {
  6071. ElementType elType, newElType, headElType;
  6072. Element elRoot, glossary, list, elText, previous;
  6073. Element el, elHead, elBody, elFrameset, elNoframes, nextEl, newEl;
  6074. Element prevEl, lastChild, firstTerm, lastTerm, termList, child;
  6075. Element parent, firstDef, lastDef, defList, firstEntry, lastEntry;
  6076. ThotBool ok, moved;
  6077. SSchema htmlSSchema;
  6078. /* the root HTML element only accepts elements HEAD, BODY, FRAMESET
  6079. Comment and PI as children */
  6080. elHead = NULL;
  6081. elBody = NULL;
  6082. elFrameset = NULL;
  6083. elNoframes = NULL;
  6084. htmlSSchema = TtaGetSSchema ("HTML", doc);
  6085. #ifdef ANNOTATIONS
  6086. if (DocumentTypes[doc] == docAnnot)
  6087. /* we search the start of HTML document in the annotation struct */
  6088. elRoot = ANNOT_GetHTMLRoot (doc, FALSE);
  6089. else
  6090. #endif /* ANNOTATIONS */
  6091. elRoot = TtaGetRootElement (doc);
  6092. if (!elRoot)
  6093. /* there is no <html> element! Create one */
  6094. {
  6095. /* create a <html> element */
  6096. elType.ElSSchema = htmlSSchema;
  6097. elType.ElTypeNum = HTML_EL_HTML;
  6098. elRoot = TtaNewElement (doc, elType);
  6099. /* insert it as the first child of the Document node */
  6100. el = TtaGetMainRoot (doc);
  6101. TtaInsertFirstChild (&elRoot, el, doc);
  6102. /* move all other children of the Document node within this
  6103. new <html> element */
  6104. el = elRoot;
  6105. TtaNextSibling (&el);
  6106. lastChild = NULL;
  6107. while (el)
  6108. {
  6109. nextEl = el;
  6110. TtaNextSibling (&nextEl);
  6111. TtaRemoveTree (el, doc);
  6112. if (!lastChild)
  6113. TtaInsertFirstChild (&el, elRoot, doc);
  6114. else
  6115. TtaInsertSibling (el, lastChild, FALSE, doc);
  6116. lastChild = el;
  6117. el = nextEl;
  6118. }
  6119. }
  6120. el = TtaGetFirstChild (elRoot);
  6121. if (el != NULL)
  6122. {
  6123. elType = TtaGetElementType (el);
  6124. /* skip Comments, PI and Invalid_elements */
  6125. while (el != NULL && elType.ElSSchema == htmlSSchema &&
  6126. (elType.ElTypeNum == HTML_EL_Comment_ ||
  6127. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6128. elType.ElTypeNum == HTML_EL_XMLPI))
  6129. {
  6130. TtaNextSibling (&el);
  6131. if (el != NULL)
  6132. elType = TtaGetElementType (el);
  6133. }
  6134. if (elType.ElTypeNum == HTML_EL_HTML && elType.ElSSchema == htmlSSchema)
  6135. /* that's the HTML root element */
  6136. {
  6137. elRoot = el;
  6138. /* check its children elements */
  6139. el = TtaGetFirstChild (elRoot);
  6140. elType = TtaGetElementType (el);
  6141. /* skip Comments, PI and Invalid_elements */
  6142. while (el != NULL && elType.ElSSchema == htmlSSchema &&
  6143. (elType.ElTypeNum == HTML_EL_Comment_ ||
  6144. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6145. elType.ElTypeNum == HTML_EL_XMLPI))
  6146. {
  6147. TtaNextSibling (&el);
  6148. if (el != NULL)
  6149. elType = TtaGetElementType (el);
  6150. }
  6151. }
  6152. if (elType.ElTypeNum == HTML_EL_HEAD && elType.ElSSchema == htmlSSchema)
  6153. /* the first child of the root element is HEAD */
  6154. {
  6155. elHead = el;
  6156. TtaNextSibling (&el);
  6157. if (el != NULL)
  6158. elType = TtaGetElementType (el);
  6159. }
  6160. else
  6161. {
  6162. elType.ElSSchema = htmlSSchema;
  6163. elType.ElTypeNum = HTML_EL_HEAD;
  6164. elHead = TtaSearchTypedElement (elType, SearchForward, elRoot);
  6165. if (elHead != NULL)
  6166. /* an element HEAD has been found */
  6167. {
  6168. /* move the HEAD element before the current element */
  6169. TtaRemoveTree (elHead, doc);
  6170. TtaInsertSibling (elHead, el, TRUE, doc);
  6171. }
  6172. }
  6173. /* skip Comments, PI and Invalid_elements */
  6174. while (el != NULL && elType.ElSSchema == htmlSSchema &&
  6175. (elType.ElTypeNum == HTML_EL_Comment_ ||
  6176. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6177. elType.ElTypeNum == HTML_EL_XMLPI))
  6178. {
  6179. TtaNextSibling (&el);
  6180. if (el != NULL)
  6181. elType = TtaGetElementType (el);
  6182. }
  6183. if (el != NULL)
  6184. {
  6185. if (elType.ElTypeNum == HTML_EL_HTML &&
  6186. elType.ElSSchema == htmlSSchema)
  6187. elRoot = el;
  6188. else if (elType.ElTypeNum == HTML_EL_BODY &&
  6189. elType.ElSSchema == htmlSSchema)
  6190. /* this child of the root element is BODY */
  6191. elBody = el;
  6192. }
  6193. /* check all children of the root element */
  6194. CheckHeadElements (elRoot, &elHead, &elBody, doc);
  6195. if (elBody != NULL)
  6196. CheckHeadElements (elBody, &elHead, &elBody, doc);
  6197. if (elHead == NULL)
  6198. /* there is no HEAD element. Create one */
  6199. {
  6200. newElType.ElSSchema = htmlSSchema;
  6201. newElType.ElTypeNum = HTML_EL_HEAD;
  6202. elHead = TtaNewTree (doc, newElType, "");
  6203. TtaInsertFirstChild (&elHead, elRoot, doc);
  6204. }
  6205. if (elHead != NULL)
  6206. {
  6207. headElType = TtaGetElementType (elHead);
  6208. /* create a Document_URL element as the first child of HEAD */
  6209. newElType.ElSSchema = htmlSSchema;
  6210. newElType.ElTypeNum = HTML_EL_Document_URL;
  6211. el = TtaGetFirstChild (elHead);
  6212. if (el != NULL)
  6213. {
  6214. elType = TtaGetElementType (el);
  6215. if (elType.ElTypeNum == newElType.ElTypeNum &&
  6216. elType.ElSSchema == newElType.ElSSchema)
  6217. /* element Document_URL already exists */
  6218. elText = TtaGetFirstChild (el);
  6219. else
  6220. el = NULL;
  6221. }
  6222. if (el == NULL)
  6223. /* there is no Document_URL element */
  6224. {
  6225. el = TtaNewElement (doc, newElType);
  6226. TtaInsertFirstChild (&el, elHead, doc);
  6227. newElType.ElTypeNum = HTML_EL_TEXT_UNIT;
  6228. elText = TtaNewElement (doc, newElType);
  6229. TtaInsertFirstChild (&elText, el, doc);
  6230. }
  6231. TtaSetAccessRight (el, ReadOnly, doc);
  6232. if (DocumentURLs[doc] != NULL && elText != NULL)
  6233. TtaSetTextContent (elText,(unsigned char *) DocumentURLs[doc], HTMLcontext.language, doc);
  6234. /* check all chidren of the HEAD Element, except the first one */
  6235. /* which is Document_URL */
  6236. TtaNextSibling (&el);
  6237. lastChild = NULL;
  6238. while (el != NULL)
  6239. {
  6240. nextEl = el;
  6241. TtaNextSibling (&nextEl);
  6242. elType = TtaGetElementType (el);
  6243. /* is this element allowed in the HEAD? */
  6244. if (TtaGetRankInAggregate (elType, headElType) <= 0)
  6245. /* this element is not a valid component of aggregate HEAD */
  6246. #ifdef TEMPLATES
  6247. /* if it is a Template element, accept it */
  6248. if (strcmp(TtaGetSSchemaName(elType.ElSSchema),"Template") != 0)
  6249. #endif /* TEMPLATES */
  6250. /* It may be an SGML inclusion, let's check */
  6251. if (!TtaCanInsertFirstChild (elType, elHead, doc))
  6252. /* this element cannot be a child of HEAD, move it to
  6253. the BODY */
  6254. {
  6255. /* create the BODY element if it does not exist */
  6256. if (elBody == NULL)
  6257. {
  6258. newElType.ElSSchema = htmlSSchema;
  6259. newElType.ElTypeNum = HTML_EL_BODY;
  6260. elBody = TtaNewElement (doc, newElType);
  6261. TtaInsertSibling (elBody, elHead, FALSE, doc);
  6262. }
  6263. /* move the current element into the BODY element */
  6264. TtaRemoveTree (el, doc);
  6265. if (lastChild == NULL)
  6266. TtaInsertFirstChild (&el, elBody, doc);
  6267. else
  6268. TtaInsertSibling (el, lastChild, FALSE, doc);
  6269. lastChild = el;
  6270. }
  6271. el = nextEl;
  6272. }
  6273. }
  6274. /* check the children of the root */
  6275. lastChild = NULL;
  6276. el = TtaGetFirstChild (elRoot);
  6277. previous = elHead;
  6278. moved = FALSE;
  6279. while (el != NULL)
  6280. {
  6281. nextEl = el;
  6282. TtaNextSibling (&nextEl);
  6283. elType = TtaGetElementType (el);
  6284. if (elType.ElTypeNum == HTML_EL_BODY &&
  6285. elType.ElSSchema == htmlSSchema)
  6286. /* stop */
  6287. nextEl = NULL;
  6288. else if (elType.ElTypeNum == HTML_EL_FRAMESET &&
  6289. elType.ElSSchema == htmlSSchema)
  6290. {
  6291. if (elFrameset == NULL)
  6292. elFrameset = el;
  6293. }
  6294. else if (elType.ElTypeNum == HTML_EL_NOFRAMES &&
  6295. elType.ElSSchema == htmlSSchema)
  6296. {
  6297. if (elNoframes == NULL)
  6298. elNoframes = el;
  6299. }
  6300. else if (!moved && elType.ElSSchema == htmlSSchema &&
  6301. (elType.ElTypeNum == HTML_EL_Invalid_element ||
  6302. elType.ElTypeNum == HTML_EL_Comment_ ||
  6303. elType.ElTypeNum == HTML_EL_XMLPI))
  6304. /* don't move Comments, PI and Invalid_elements if the previous
  6305. element has not been moved */
  6306. previous = el;
  6307. else if (elType.ElTypeNum == HTML_EL_HEAD &&
  6308. elType.ElSSchema == htmlSSchema)
  6309. previous = el;
  6310. else if (elType.ElTypeNum != HTML_EL_FRAMESET ||
  6311. elType.ElSSchema != htmlSSchema)
  6312. /* this element should be a child of BODY */
  6313. {
  6314. /* create the BODY element if it does not exist */
  6315. if (elBody == NULL)
  6316. {
  6317. newElType.ElSSchema = htmlSSchema;
  6318. newElType.ElTypeNum = HTML_EL_BODY;
  6319. elBody = TtaNewElement (doc, newElType);
  6320. if (previous == NULL)
  6321. TtaInsertFirstChild (&elBody, elRoot, doc);
  6322. else
  6323. TtaInsertSibling (elBody, previous, FALSE, doc);
  6324. }
  6325. /* move the current element into the BODY element */
  6326. TtaRemoveTree (el, doc);
  6327. if (lastChild == NULL)
  6328. TtaInsertFirstChild (&el, elBody, doc);
  6329. else
  6330. TtaInsertSibling (el, lastChild, FALSE, doc);
  6331. lastChild = el;
  6332. moved = TRUE;
  6333. }
  6334. /* get next child of the root */
  6335. el = nextEl;
  6336. }
  6337. if (elFrameset && elNoframes)
  6338. if (!TtaIsAncestor(elNoframes, elFrameset))
  6339. /* moves the NOFRAMES element within the FRAMESET element */
  6340. {
  6341. el = TtaGetFirstChild(elFrameset);
  6342. previous = NULL;
  6343. while (el)
  6344. {
  6345. previous = el;
  6346. TtaNextSibling (&el);
  6347. }
  6348. TtaRemoveTree (elNoframes, doc);
  6349. if (previous == NULL)
  6350. TtaInsertFirstChild (&elNoframes, elFrameset, doc);
  6351. else
  6352. TtaInsertSibling (elNoframes, previous, FALSE, doc);
  6353. }
  6354. /* handle character-level elements which contain block-level elements*/
  6355. CheckBlocksInCharElem (doc);
  6356. /* create a Term_List element for each sequence of Term elements */
  6357. el = TtaGetFirstChild (elRoot);
  6358. /* search all Term elements in the document */
  6359. while (el != NULL)
  6360. {
  6361. elType.ElSSchema = htmlSSchema;
  6362. elType.ElTypeNum = HTML_EL_Term;
  6363. el = TtaSearchTypedElement (elType, SearchForward, el);
  6364. if (el != NULL)
  6365. /* an element Term has been found */
  6366. {
  6367. /* search all immediate Term siblings, ignoring
  6368. Comments, PI and Invalid_elements */
  6369. firstTerm = el;
  6370. do
  6371. {
  6372. lastTerm = el;
  6373. TtaNextSibling (&el);
  6374. if (el == NULL)
  6375. elType.ElTypeNum = 0;
  6376. else
  6377. elType = TtaGetElementType (el);
  6378. }
  6379. while (elType.ElSSchema == htmlSSchema &&
  6380. (elType.ElTypeNum == HTML_EL_Term ||
  6381. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6382. elType.ElTypeNum == HTML_EL_Comment_ ||
  6383. elType.ElTypeNum == HTML_EL_XMLPI));
  6384. termList = ParentOfType (firstTerm, HTML_EL_Term_List);
  6385. if (!termList)
  6386. {
  6387. /* create a Term_List element before the first
  6388. Term element */
  6389. newElType.ElSSchema = htmlSSchema;
  6390. newElType.ElTypeNum = HTML_EL_Term_List;
  6391. termList = TtaNewElement (doc, newElType);
  6392. TtaInsertSibling (termList, firstTerm, TRUE, doc);
  6393. /* move the Term elements as children of the new Term_List */
  6394. nextEl = firstTerm;
  6395. TtaNextSibling (&nextEl);
  6396. TtaRemoveTree (firstTerm, doc);
  6397. TtaInsertFirstChild (&firstTerm, termList, doc);
  6398. if (lastTerm != firstTerm)
  6399. {
  6400. prevEl = firstTerm;
  6401. do
  6402. {
  6403. child = nextEl;
  6404. TtaNextSibling (&nextEl);
  6405. TtaRemoveTree (child, doc);
  6406. TtaInsertSibling (child, prevEl, FALSE, doc);
  6407. prevEl = child;
  6408. }
  6409. while (nextEl != NULL && child != lastTerm);
  6410. }
  6411. }
  6412. if (!ParentOfType (termList, HTML_EL_Definition_Item))
  6413. {
  6414. /* Create a Definition_Item element surrounding */
  6415. /* the Term_List element */
  6416. newElType.ElSSchema = htmlSSchema;
  6417. newElType.ElTypeNum = HTML_EL_Definition_Item;
  6418. newEl = TtaNewElement (doc, newElType);
  6419. TtaInsertSibling (newEl, termList, TRUE, doc);
  6420. TtaRemoveTree (termList, doc);
  6421. TtaInsertFirstChild (&termList, newEl, doc);
  6422. }
  6423. if (el != NULL)
  6424. {
  6425. elType = TtaGetElementType (el);
  6426. if (elType.ElTypeNum == HTML_EL_Definition &&
  6427. elType.ElSSchema == htmlSSchema)
  6428. /* the following element is a definition */
  6429. {
  6430. /* search all Definition siblings, ignoring
  6431. Comments, PI and Invalid_elements */
  6432. firstDef = el;
  6433. do
  6434. {
  6435. lastDef = el;
  6436. TtaNextSibling (&el);
  6437. if (el == NULL)
  6438. elType.ElTypeNum = 0;
  6439. else
  6440. elType = TtaGetElementType (el);
  6441. }
  6442. while (elType.ElSSchema == htmlSSchema &&
  6443. (elType.ElTypeNum == HTML_EL_Definition ||
  6444. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6445. elType.ElTypeNum == HTML_EL_Comment_ ||
  6446. elType.ElTypeNum == HTML_EL_XMLPI));
  6447. if (!ParentOfType (firstDef, HTML_EL_Definitions))
  6448. {
  6449. /* create a Definitions element after the
  6450. Term_List element */
  6451. newElType.ElSSchema = htmlSSchema;
  6452. newElType.ElTypeNum = HTML_EL_Definitions;
  6453. defList = TtaNewElement (doc, newElType);
  6454. TtaInsertSibling (defList, termList, FALSE, doc);
  6455. /* move the Definitions elements as children of the
  6456. new Definitions element */
  6457. nextEl = firstDef;
  6458. TtaNextSibling (&nextEl);
  6459. TtaRemoveTree (firstDef, doc);
  6460. TtaInsertFirstChild (&firstDef, defList, doc);
  6461. if (lastDef != firstDef)
  6462. {
  6463. prevEl = firstDef;
  6464. do
  6465. {
  6466. child = nextEl;
  6467. TtaNextSibling (&nextEl);
  6468. TtaRemoveTree (child, doc);
  6469. TtaInsertSibling (child, prevEl, FALSE, doc);
  6470. prevEl = child;
  6471. }
  6472. while (nextEl != NULL && child != lastDef);
  6473. }
  6474. }
  6475. }
  6476. }
  6477. /* starting element for the next search of a Term element */
  6478. el = lastTerm;
  6479. }
  6480. }
  6481. /* search all Definition elements without a Definitions parent */
  6482. el = TtaGetFirstChild (elRoot);
  6483. if (el != NULL)
  6484. {
  6485. /* search all Definition elements in the document */
  6486. while (el != NULL)
  6487. {
  6488. elType.ElSSchema = htmlSSchema;
  6489. elType.ElTypeNum = HTML_EL_Definition;
  6490. el = TtaSearchTypedElement (elType, SearchForward, el);
  6491. if (el != NULL)
  6492. /* an element Definition has been found */
  6493. {
  6494. if (!ParentOfType (el, HTML_EL_Definitions))
  6495. /* this Definition is not within a Definitions
  6496. element */
  6497. {
  6498. /* search all Definition siblings, ignoring Comments,
  6499. PIs and Invalid_elements */
  6500. firstDef = el;
  6501. do
  6502. {
  6503. lastDef = el;
  6504. TtaNextSibling (&el);
  6505. if (el == NULL)
  6506. elType.ElTypeNum = 0;
  6507. else
  6508. elType = TtaGetElementType (el);
  6509. }
  6510. while (elType.ElSSchema == htmlSSchema &&
  6511. (elType.ElTypeNum == HTML_EL_Definition ||
  6512. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6513. elType.ElTypeNum == HTML_EL_Comment_ ||
  6514. elType.ElTypeNum == HTML_EL_XMLPI));
  6515. /* create a Definitions element */
  6516. newElType.ElSSchema = htmlSSchema;
  6517. newElType.ElTypeNum = HTML_EL_Definitions;
  6518. defList = TtaNewElement (doc, newElType);
  6519. TtaInsertSibling (defList, firstDef, TRUE, doc);
  6520. TtaRemoveTree (firstDef, doc);
  6521. TtaInsertFirstChild (&firstDef, defList, doc);
  6522. if (!ParentOfType (defList, HTML_EL_Definition_Item))
  6523. /* this Definition is not within a Definition_Item
  6524. element */
  6525. {
  6526. /* create a Definition_Item */
  6527. newElType.ElTypeNum = HTML_EL_Definition_Item;
  6528. newEl = TtaNewElement (doc, newElType);
  6529. TtaInsertSibling (newEl, defList, TRUE, doc);
  6530. TtaRemoveTree (defList, doc);
  6531. TtaInsertFirstChild (&defList, newEl, doc);
  6532. }
  6533. /* move the Definitions elements as children of the
  6534. Definitions element */
  6535. nextEl = firstDef;
  6536. TtaNextSibling (&nextEl);
  6537. if (lastDef != firstDef)
  6538. {
  6539. prevEl = firstDef;
  6540. do
  6541. {
  6542. child = nextEl;
  6543. TtaNextSibling (&nextEl);
  6544. TtaRemoveTree (child, doc);
  6545. TtaInsertSibling (child, prevEl, FALSE, doc);
  6546. prevEl = child;
  6547. }
  6548. while (nextEl != NULL && child != lastDef);
  6549. }
  6550. }
  6551. }
  6552. }
  6553. }
  6554. if (!isXTiger)
  6555. {
  6556. /* create a surrounding element Definition_List for each sequence */
  6557. /* of elements Definition_Item which are not in a Definition_List */
  6558. el = TtaGetFirstChild (elRoot);
  6559. if (el != NULL)
  6560. {
  6561. /* search all elements Definition_Item in the document */
  6562. while (el != NULL)
  6563. {
  6564. elType.ElSSchema = htmlSSchema;
  6565. elType.ElTypeNum = HTML_EL_Definition_Item;
  6566. el = TtaSearchTypedElement (elType, SearchForward, el);
  6567. if (el != NULL)
  6568. /* an element Definition_Item has been found */
  6569. {
  6570. if (!ParentOfType (el, HTML_EL_Definition_List))
  6571. /* this Definition_Item is not a child of a Definition_List*/
  6572. {
  6573. /* search all immediate Definition_Item siblings */
  6574. firstEntry = el;
  6575. do
  6576. {
  6577. lastEntry = el;
  6578. TtaNextSibling (&el);
  6579. if (el == NULL)
  6580. elType.ElTypeNum = 0;
  6581. else
  6582. elType = TtaGetElementType (el);
  6583. }
  6584. while (elType.ElSSchema == htmlSSchema &&
  6585. (elType.ElTypeNum == HTML_EL_Definition_Item ||
  6586. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6587. elType.ElTypeNum == HTML_EL_Comment_ ||
  6588. elType.ElTypeNum == HTML_EL_XMLPI));
  6589. /* create a Definition_List element before the */
  6590. /* first Definition_Item element */
  6591. newElType.ElSSchema = htmlSSchema;
  6592. newElType.ElTypeNum = HTML_EL_Definition_List;
  6593. glossary = TtaNewElement (doc, newElType);
  6594. TtaInsertSibling (glossary, firstEntry, TRUE, doc);
  6595. /* move the Definition_Item elements as children */
  6596. /* of the new Definition_List element */
  6597. nextEl = firstEntry;
  6598. TtaNextSibling (&nextEl);
  6599. TtaRemoveTree (firstEntry, doc);
  6600. TtaInsertFirstChild (&firstEntry, glossary, doc);
  6601. if (lastEntry != firstEntry)
  6602. {
  6603. prevEl = firstEntry;
  6604. do
  6605. {
  6606. child = nextEl;
  6607. TtaNextSibling (&nextEl);
  6608. TtaRemoveTree (child, doc);
  6609. TtaInsertSibling (child, prevEl, FALSE, doc);
  6610. prevEl = child;
  6611. }
  6612. while (nextEl != NULL && child != lastEntry);
  6613. }
  6614. /* starting element for the next search of a */
  6615. /* Definition_Item */
  6616. el = lastEntry;
  6617. }
  6618. }
  6619. }
  6620. }
  6621. /* create a surrounding element Unnumbered_List for each sequence */
  6622. /* of elements List_Item which are not in a Unnumbered_List, a */
  6623. /* Numbered_List, a Menu, or a Directory */
  6624. el = TtaGetFirstChild (elRoot);
  6625. if (el != NULL)
  6626. {
  6627. /* search all elements List_Item in the document */
  6628. do
  6629. {
  6630. elType.ElSSchema = htmlSSchema;
  6631. elType.ElTypeNum = HTML_EL_List_Item;
  6632. el = TtaSearchTypedElement (elType, SearchForward, el);
  6633. if (el != NULL)
  6634. /* an element List_Item has been found */
  6635. {
  6636. parent = TtaGetParent (el);
  6637. elType = TtaGetElementType (parent);
  6638. #ifdef TEMPLATES
  6639. /* if its parent is a Template element, skip it */
  6640. while (strcmp(TtaGetSSchemaName(elType.ElSSchema),"Template") == 0)
  6641. {
  6642. parent = TtaGetParent (parent);
  6643. elType = TtaGetElementType (parent);
  6644. }
  6645. #endif /* TEMPLATES */
  6646. if (elType.ElSSchema != htmlSSchema ||
  6647. (elType.ElTypeNum != HTML_EL_Unnumbered_List &&
  6648. elType.ElTypeNum != HTML_EL_Numbered_List &&
  6649. elType.ElTypeNum != HTML_EL_Menu &&
  6650. elType.ElTypeNum != HTML_EL_Directory))
  6651. /* this List_Item is not within a list */
  6652. {
  6653. /* search all immediate List_Item siblings */
  6654. firstEntry = el;
  6655. do
  6656. {
  6657. lastEntry = el;
  6658. TtaNextSibling (&el);
  6659. if (el == NULL)
  6660. elType.ElTypeNum = 0;
  6661. else
  6662. elType = TtaGetElementType (el);
  6663. }
  6664. while (elType.ElSSchema == htmlSSchema &&
  6665. (elType.ElTypeNum == HTML_EL_List_Item ||
  6666. elType.ElTypeNum == HTML_EL_Invalid_element ||
  6667. elType.ElTypeNum == HTML_EL_Comment_ ||
  6668. elType.ElTypeNum == HTML_EL_XMLPI));
  6669. /* create a HTML_EL_Unnumbered_List element before
  6670. the first List_Item element */
  6671. newElType.ElSSchema = htmlSSchema;
  6672. newElType.ElTypeNum = HTML_EL_Unnumbered_List;
  6673. list = TtaNewElement (doc, newElType);
  6674. TtaInsertSibling (list, firstEntry, TRUE, doc);
  6675. /* move the List_Item elements as children of */
  6676. /* the new HTML_EL_Unnumbered_List element */
  6677. nextEl = firstEntry;
  6678. TtaNextSibling (&nextEl);
  6679. TtaRemoveTree (firstEntry, doc);
  6680. TtaInsertFirstChild (&firstEntry, list, doc);
  6681. if (lastEntry != firstEntry)
  6682. {
  6683. prevEl = firstEntry;
  6684. do
  6685. {
  6686. child = nextEl;
  6687. TtaNextSibling (&nextEl);
  6688. TtaRemoveTree (child, doc);
  6689. TtaInsertSibling (child, prevEl, FALSE, doc);
  6690. prevEl = child;
  6691. }
  6692. while (nextEl != NULL && child != lastEntry);
  6693. }
  6694. /* starting element for the next search of a
  6695. List_Item */
  6696. el = lastEntry;
  6697. }
  6698. }
  6699. }
  6700. while (el);
  6701. }
  6702. }
  6703. /* merge sibling Text elements with same attributes */
  6704. el = elRoot;
  6705. elType.ElSSchema = htmlSSchema;
  6706. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  6707. /* search all TEXT elements in the document */
  6708. while (el != NULL)
  6709. {
  6710. /* search the next TEXT element in the abstract tree */
  6711. el = TtaSearchTypedElement (elType, SearchForward, el);
  6712. if (el != NULL)
  6713. /* a Text element has been found. Try to merge it with its */
  6714. /* following siblings */
  6715. do
  6716. ok = TtaMergeText (el, doc);
  6717. while (ok);
  6718. }
  6719. #ifdef IV
  6720. /* checks all MAP elements. If they are within a Block element, */
  6721. /* move them up in the structure */
  6722. el = elRoot;
  6723. elType.ElSSchema = htmlSSchema;
  6724. elType.ElTypeNum = HTML_EL_map;
  6725. /* search all MAP elements in the document */
  6726. while (el)
  6727. {
  6728. /* search the next MAP element in the abstract tree */
  6729. el = TtaSearchTypedElement (elType, SearchForward, el);
  6730. if (el != NULL)
  6731. /* a MAP element has been found. */
  6732. {
  6733. parent = TtaGetParent(el);
  6734. #ifdef TEMPLATES
  6735. /* if its parent is a Template element, skip it */
  6736. elType = TtaGetElementType (parent);
  6737. while (strcmp(TtaGetSSchemaName(elType.ElSSchema),"Template") == 0)
  6738. {
  6739. parent = TtaGetParent (parent);
  6740. elType = TtaGetElementType (parent);
  6741. }
  6742. #endif /* TEMPLATES */
  6743. if (IsBlockElement (parent))
  6744. /* its parent is a block element */
  6745. {
  6746. TtaRemoveTree (el, doc);
  6747. TtaInsertSibling (el, parent, TRUE, doc);
  6748. }
  6749. }
  6750. }
  6751. #endif /* IV */
  6752. /* If element BODY is empty, create an empty element as a placeholder*/
  6753. if (elBody)
  6754. {
  6755. el = TtaGetFirstChild (elBody);
  6756. elType = TtaGetElementType (el);
  6757. while (el && TtaHasNotElementException(TtaGetElementType (el)))
  6758. TtaNextSibling (&el);
  6759. if (el == NULL)
  6760. {
  6761. newElType.ElSSchema = htmlSSchema;
  6762. newElType.ElTypeNum = HTML_EL_Element;
  6763. newEl = TtaNewElement (doc, newElType);
  6764. TtaInsertFirstChild (&newEl, elBody, doc);
  6765. }
  6766. }
  6767. /* add additional checking here */
  6768. }
  6769. }
  6770. /*----------------------------------------------------------------------
  6771. InitializeHTMLParser
  6772. initializes variables and stack for parsing file
  6773. the parser will insert the thot tree after or as a child
  6774. of last elem, in the document doc.
  6775. If last elem is NULL or doc=0, the parser doesn't initialize
  6776. the stack
  6777. ----------------------------------------------------------------------*/
  6778. static void InitializeHTMLParser (Element lastelem, ThotBool isclosed,
  6779. Document doc)
  6780. {
  6781. char tag[20];
  6782. Element elem;
  6783. int i;
  6784. SSchema schema;
  6785. StackLevel = 1;
  6786. HTMLcontext.language = TtaGetDefaultLanguage ();
  6787. HTMLcontext.parsingTextArea = FALSE;
  6788. HTMLcontext.parsingScript = FALSE;
  6789. HTMLcontext.parsingCSS = FALSE;
  6790. if (lastelem != NULL && doc != 0)
  6791. {
  6792. /* initialize the stack with ancestors of lastelem */
  6793. HTMLcontext.doc = doc;
  6794. DocumentSSchema = TtaGetDocumentSSchema (HTMLcontext.doc);
  6795. #ifdef ANNOTATIONS
  6796. if (DocumentTypes[doc] == docAnnot)
  6797. rootElement = ANNOT_GetHTMLRoot (doc, TRUE);
  6798. else
  6799. #endif /* ANNOTATIONS */
  6800. rootElement = TtaGetMainRoot (HTMLcontext.doc);
  6801. if (isclosed)
  6802. elem = TtaGetParent (lastelem);
  6803. else
  6804. elem = lastelem;
  6805. while (elem != NULL && elem != rootElement && StackLevel < MaxStack-2)
  6806. {
  6807. strcpy ((char *)tag, (char *)GetXMLElementName (TtaGetElementType (elem), doc));
  6808. if (strcmp (tag, "???"))
  6809. {
  6810. for (i = StackLevel; i > 0; i--)
  6811. {
  6812. GINumberStack[i + 1] = GINumberStack[i];
  6813. ElementStack[i + 1] = ElementStack[i];
  6814. LanguageStack[i + 1] = LanguageStack[i];
  6815. ThotLevel[i + 1] = ThotLevel[i] + 1;
  6816. }
  6817. schema = DocumentSSchema;
  6818. GINumberStack[1] = MapGI ((char *)tag, &schema, HTMLcontext.doc);
  6819. ElementStack[1] = elem;
  6820. ThotLevel[1] = 1;
  6821. LanguageStack[1] = HTMLcontext.language;
  6822. StackLevel++;
  6823. }
  6824. elem = TtaGetParent (elem);
  6825. }
  6826. HTMLcontext.lastElement = lastelem;
  6827. HTMLcontext.lastElementClosed = isclosed;
  6828. }
  6829. else
  6830. {
  6831. HTMLcontext.lastElement = rootElement;
  6832. HTMLcontext.lastElementClosed = FALSE;
  6833. }
  6834. NumberOfCharRead = 0;
  6835. NumberOfLinesRead = 1;
  6836. /* input file is supposed to be HTML */
  6837. GINumberStack[0] = -1;
  6838. ElementStack[0] = rootElement;
  6839. ThotLevel[0] = 1;
  6840. lastElemEntry = 0;
  6841. /* initialize input buffer */
  6842. EmptyLine = TRUE;
  6843. StartOfFile = TRUE;
  6844. inputBuffer[0] = EOS;
  6845. LgBuffer = 0;
  6846. lastAttribute = NULL;
  6847. lastAttrElement = NULL;
  6848. lastAttrEntry = NULL;
  6849. UnknownAttr = FALSE;
  6850. ReadingAnAttrValue = FALSE;
  6851. LgEntityName = 0;
  6852. EntityTableEntry = 0;
  6853. CharRank = 0;
  6854. HTMLcontext.mergeText = FALSE;
  6855. AfterTagPRE = FALSE;
  6856. HTMLcontext.parsingCSS = FALSE;
  6857. CurrentBufChar = 0;
  6858. }
  6859. /*----------------------------------------------------------------------
  6860. ParseIncludedHTML
  6861. ----------------------------------------------------------------------*/
  6862. void ParseIncludedHTML (Element elem, char *closingTag)
  6863. {
  6864. Element oldLastElement;
  6865. ThotBool oldLastElementClosed;
  6866. int oldLastElemEntry;
  6867. oldLastElement = HTMLcontext.lastElement;
  6868. HTMLcontext.lastElement = elem;
  6869. oldLastElementClosed = HTMLcontext.lastElementClosed;
  6870. HTMLcontext.lastElementClosed = FALSE;
  6871. oldLastElemEntry = lastElemEntry;
  6872. lastElemEntry = 0;
  6873. HTMLrootClosingTag = closingTag;
  6874. /* TODO: the XML parser must call that function with two new parameters:
  6875. the current infile and current index */
  6876. HTMLparse ((FILE*)stream, NULL);
  6877. HTMLcontext.lastElement = oldLastElement;
  6878. HTMLcontext.lastElementClosed = oldLastElementClosed;
  6879. lastElemEntry = oldLastElemEntry;
  6880. }
  6881. /*----------------------------------------------------------------------
  6882. ParseSubTree called by tranformation.
  6883. ----------------------------------------------------------------------*/
  6884. void ParseSubTree (char* HTMLbuf, Element lastelem, Language language,
  6885. ThotBool isclosed, Document doc)
  6886. {
  6887. ElementType elType;
  6888. char *schemaName;
  6889. docURL = NULL;
  6890. docURL2 = NULL;
  6891. elType = TtaGetElementType (lastelem);
  6892. schemaName = TtaGetSSchemaName(elType.ElSSchema);
  6893. if (strcmp (schemaName, "HTML") == 0)
  6894. /* parse an HTML subtree */
  6895. {
  6896. InitializeHTMLParser (lastelem, isclosed, doc);
  6897. /* Initialize the language context with the 'lastelem' language*/
  6898. HTMLcontext.language = language;
  6899. /* transformation files are alway encoded in UTF_8 */
  6900. HTMLcontext.encoding = UTF_8;
  6901. /* We set number line with 0 when we are parsing a sub-tree */
  6902. NumberOfLinesRead = 0;
  6903. HTMLparse (NULL, HTMLbuf);
  6904. /* Handle character-level elements which contain block-level elements */
  6905. TtaSetStructureChecking (FALSE, doc);
  6906. CheckBlocksInCharElem (doc);
  6907. TtaSetStructureChecking (TRUE, doc);
  6908. }
  6909. else
  6910. {
  6911. InputText = HTMLbuf;
  6912. /* InputText = HTMLbuf; */
  6913. CurrentBufChar = 0;
  6914. if (!ParseXmlBuffer (InputText, lastelem, isclosed, doc, language, NULL))
  6915. StopParsing (doc);
  6916. }
  6917. }
  6918. /*-------------------------------------------------------------------------------
  6919. ParseExternalHTMLDoc
  6920. Parse an external HTML document called from an other document
  6921. ------------------------------------------------------------------------------*/
  6922. void ParseExternalHTMLDoc (Document doc, FILE * infile, CHARSET charset, char *extDocURL)
  6923. {
  6924. Element el, oldel;
  6925. int error;
  6926. /* Context initialization */
  6927. HTMLcontext.doc = doc;
  6928. FirstElemToBeChecked = NULL;
  6929. LastElemToBeChecked = NULL;
  6930. HTMLcontext.lastElement = NULL;
  6931. HTMLcontext.lastElementClosed = FALSE;
  6932. lastElemEntry = 0;
  6933. lastAttribute = NULL;
  6934. lastAttrElement = NULL;
  6935. lastAttrEntry = NULL;
  6936. UnknownAttr = FALSE;
  6937. ReadingAnAttrValue = FALSE;
  6938. CommentText = NULL;
  6939. UnknownTag = FALSE;
  6940. HTMLcontext.mergeText = FALSE;
  6941. LgEntityName = 0;
  6942. EntityTableEntry = 0;
  6943. CharRank = 0;
  6944. HTMLcontext.encoding = TtaGetDocumentCharset (doc);
  6945. HTMLcontext.withinTable = 0;
  6946. LastCharInWorkBuffer = 0;
  6947. FileBuffer[0] = EOS;
  6948. HTMLcontext.language = TtaGetDefaultLanguage ();
  6949. DocumentSSchema = TtaGetDocumentSSchema (doc);
  6950. rootElement = TtaGetMainRoot (doc);
  6951. /* delete all element except the root element and its parent document
  6952. element */
  6953. el = TtaGetFirstChild (rootElement);
  6954. while (el != NULL)
  6955. {
  6956. oldel = el;
  6957. TtaNextSibling (&el);
  6958. TtaDeleteTree (oldel, doc);
  6959. }
  6960. docURL = (char*)TtaGetMemory (strlen ((char *)extDocURL) + 1);
  6961. strcpy ((char *)docURL, (char *)extDocURL);
  6962. /* Check if it's a valid encoding */
  6963. if (DocumentMeta[doc]->charset)
  6964. {
  6965. charset = TtaGetCharset (DocumentMeta[doc]->charset);
  6966. if (charset != UTF_8 && charset != ISO_8859_1 &&
  6967. charset != ISO_8859_2 && charset != ISO_8859_3 &&
  6968. charset != ISO_8859_4 && charset != ISO_8859_5 &&
  6969. charset != ISO_8859_6 && charset != ISO_8859_7 &&
  6970. charset != ISO_8859_8 && charset != ISO_8859_9 &&
  6971. charset != ISO_8859_15 && charset != KOI8_R &&
  6972. charset != WINDOWS_1250 && charset != WINDOWS_1251 &&
  6973. charset != WINDOWS_1252 && charset != WINDOWS_1253 &&
  6974. charset != WINDOWS_1254 && charset != WINDOWS_1255 &&
  6975. charset != WINDOWS_1256 && charset != WINDOWS_1257 &&
  6976. charset != US_ASCII && charset != SHIFT_JIS &&
  6977. charset != ISO_2022_JP && charset != EUC_JP &&
  6978. charset != SHIFT_JIS && charset != GB_2312)
  6979. HTMLParseError (doc,
  6980. TtaGetMessage (AMAYA, AM_UNKNOWN_ENCODING), 0);
  6981. }
  6982. /* parse the input file and build the external document */
  6983. /* initialize parsing environment */
  6984. InitializeHTMLParser (NULL, FALSE, 0);
  6985. HTMLparse (infile, NULL);
  6986. /* completes all unclosed elements */
  6987. el = HTMLcontext.lastElement;
  6988. while (el != NULL)
  6989. {
  6990. XhtmlElementComplete (&HTMLcontext, el, &error);
  6991. el = TtaGetParent (el);
  6992. }
  6993. if (docURL)
  6994. {
  6995. TtaFreeMemory (docURL);
  6996. docURL = NULL;
  6997. }
  6998. DocumentSSchema = NULL;
  6999. HTMLcontext.doc = 0;
  7000. CleanUpParsingErrors ();
  7001. return;
  7002. }
  7003. /*-------------------------------------------------------------------------------
  7004. ClearHTMLParser
  7005. Clear all parser variables
  7006. ------------------------------------------------------------------------------*/
  7007. void ClearHTMLParser ()
  7008. {
  7009. PtrElemToBeChecked elTBC;
  7010. /* clean up the list of ElemToBeChecked */
  7011. elTBC = FirstElemToBeChecked;
  7012. while (FirstElemToBeChecked)
  7013. {
  7014. LastElemToBeChecked = FirstElemToBeChecked->nextElemToBeChecked;
  7015. TtaFreeMemory (FirstElemToBeChecked);
  7016. FirstElemToBeChecked = LastElemToBeChecked;
  7017. }
  7018. FirstElemToBeChecked = NULL;
  7019. LastElemToBeChecked = NULL;
  7020. lastElemEntry = 0;
  7021. lastAttribute = NULL;
  7022. lastAttrElement = NULL;
  7023. lastAttrEntry = NULL;
  7024. UnknownAttr = FALSE;
  7025. ReadingAnAttrValue = FALSE;
  7026. CommentText = NULL;
  7027. UnknownTag = FALSE;
  7028. LastCharInWorkBuffer = 0;
  7029. FileBuffer[0] = EOS;
  7030. LgEntityName = 0;
  7031. EntityTableEntry = 0;
  7032. CharRank = 0;
  7033. }
  7034. /*-------------------------------------------------------------------------------
  7035. StartParser
  7036. Loads the file Directory/htmlFileName for displaying the document documentName.
  7037. The parameter pathURL gives the original (local or distant)
  7038. path or URL of the html document.
  7039. ------------------------------------------------------------------------------*/
  7040. void StartParser (Document doc, char *fileName,
  7041. const char *documentName, char* documentDirectory,
  7042. const char *pathURL, ThotBool plainText, ThotBool external_doc)
  7043. {
  7044. DisplayMode dispMode;
  7045. CHARSET charset;
  7046. Element el, oldel, root;
  7047. AttributeType attrType;
  7048. Attribute attr;
  7049. char *s;
  7050. char tempname[MAX_LENGTH];
  7051. char temppath[MAX_LENGTH];
  7052. ThotBool isHTML;
  7053. int error;
  7054. HTMLcontext.doc = doc;
  7055. FirstElemToBeChecked = NULL;
  7056. LastElemToBeChecked = NULL;
  7057. HTMLcontext.lastElement = NULL;
  7058. HTMLcontext.lastElementClosed = FALSE;
  7059. lastElemEntry = 0;
  7060. lastAttribute = NULL;
  7061. lastAttrElement = NULL;
  7062. lastAttrEntry = NULL;
  7063. UnknownAttr = FALSE;
  7064. ReadingAnAttrValue = FALSE;
  7065. CommentText = NULL;
  7066. UnknownTag = FALSE;
  7067. HTMLcontext.mergeText = FALSE;
  7068. HTMLcontext.withinTable = 0;
  7069. LastCharInWorkBuffer = 0;
  7070. FileBuffer[0] = EOS;
  7071. LgEntityName = 0;
  7072. EntityTableEntry = 0;
  7073. CharRank = 0;
  7074. HTMLcontext.encoding = TtaGetDocumentCharset (doc);
  7075. stream = TtaGZOpen (fileName);
  7076. if (stream != 0)
  7077. {
  7078. if (documentName[0] == EOS && !TtaCheckDirectory (documentDirectory))
  7079. {
  7080. strcpy ((char *)documentName, (char *)documentDirectory);
  7081. documentDirectory[0] = EOS;
  7082. s = TtaGetEnvString ("PWD");
  7083. /* set path on current directory */
  7084. if (s != NULL)
  7085. strcpy ((char *)documentDirectory, (char *)s);
  7086. else
  7087. documentDirectory[0] = EOS;
  7088. }
  7089. TtaAppendDocumentPath (documentDirectory);
  7090. /* Set document URL */
  7091. if (DocumentURLs[doc])
  7092. {
  7093. docURL = (char*)TtaGetMemory (strlen ((char *)DocumentURLs[doc]) + 1);
  7094. strcpy ((char *)docURL, (char *)DocumentURLs[doc]);
  7095. }
  7096. else
  7097. {
  7098. docURL = (char*)TtaGetMemory (strlen ((char *)pathURL) + 1);
  7099. strcpy ((char *)docURL, (char *)pathURL);
  7100. }
  7101. /* Set document URL2 */
  7102. if (docURL)
  7103. {
  7104. docURL2 = (char *)TtaGetMemory (strlen ((char *)docURL) + 1);
  7105. strcpy ((char *)docURL2, (char *)docURL);
  7106. }
  7107. /* do not check the Thot abstract tree against the structure */
  7108. /* schema while building the Thot document. */
  7109. TtaSetStructureChecking (FALSE, doc);
  7110. /* set the notification mode for the new document */
  7111. TtaSetNotificationMode (doc, 1);
  7112. HTMLcontext.language = TtaGetDefaultLanguage ();
  7113. #ifdef ANNOTATIONS
  7114. if (DocumentTypes[doc] == docAnnot)
  7115. {
  7116. /* get the schema associated to the annotation body */
  7117. DocumentSSchema = ANNOT_GetBodySSchema (doc);
  7118. attrType.AttrSSchema = DocumentSSchema;
  7119. }
  7120. else
  7121. #endif /* ANNOTATIONS */
  7122. DocumentSSchema = TtaGetDocumentSSchema (doc);
  7123. /* is the current document a HTML document */
  7124. isHTML = (strcmp (TtaGetSSchemaName (DocumentSSchema), "HTML") == 0);
  7125. if (plainText)
  7126. {
  7127. #ifdef ANNOTATIONS
  7128. if (DocumentTypes[doc] == docAnnot)
  7129. rootElement = ANNOT_GetHTMLRoot (doc, TRUE);
  7130. else
  7131. #endif /* ANNOTATIONS */
  7132. rootElement = TtaGetRootElement (doc);
  7133. if (DocumentTypes[doc] == docSource || DocumentTypes[doc] == docCSS)
  7134. {
  7135. /* add the attribute Source */
  7136. attrType.AttrSSchema = DocumentSSchema;
  7137. attrType.AttrTypeNum = TextFile_ATTR_Source;
  7138. attr = TtaGetAttribute (rootElement, attrType);
  7139. if (attr == 0)
  7140. {
  7141. attr = TtaNewAttribute (attrType);
  7142. TtaAttachAttribute (rootElement, attr, doc);
  7143. }
  7144. }
  7145. /* add the default attribute PrintURL */
  7146. attrType.AttrSSchema = DocumentSSchema;
  7147. attrType.AttrTypeNum = TextFile_ATTR_PrintURL;
  7148. attr = TtaGetAttribute (rootElement, attrType);
  7149. if (attr == 0)
  7150. {
  7151. attr = TtaNewAttribute (attrType);
  7152. TtaAttachAttribute (rootElement, attr, doc);
  7153. }
  7154. }
  7155. else
  7156. {
  7157. /* Check if it's a valid encoding */
  7158. if (DocumentMeta[doc]->charset)
  7159. {
  7160. charset = TtaGetCharset (DocumentMeta[doc]->charset);
  7161. if (charset != UTF_8 && charset != ISO_8859_1 &&
  7162. charset != ISO_8859_2 && charset != ISO_8859_3 &&
  7163. charset != ISO_8859_4 && charset != ISO_8859_5 &&
  7164. charset != ISO_8859_6 && charset != ISO_8859_7 &&
  7165. charset != ISO_8859_8 && charset != ISO_8859_9 &&
  7166. charset != ISO_8859_15 && charset != KOI8_R &&
  7167. charset != WINDOWS_1250 && charset != WINDOWS_1251 &&
  7168. charset != WINDOWS_1252 && charset != WINDOWS_1253 &&
  7169. charset != WINDOWS_1254 && charset != WINDOWS_1255 &&
  7170. charset != WINDOWS_1256 && charset != WINDOWS_1257 &&
  7171. charset != US_ASCII && charset != SHIFT_JIS &&
  7172. charset != ISO_2022_JP && charset != EUC_JP &&
  7173. charset != SHIFT_JIS && charset != GB_2312)
  7174. HTMLParseError (doc,
  7175. TtaGetMessage (AMAYA, AM_UNKNOWN_ENCODING), 0);
  7176. }
  7177. if (!isHTML)
  7178. {
  7179. /* change the document type */
  7180. TtaFreeView (doc, 1);
  7181. doc = TtaNewDocument ("HTML", documentName);
  7182. if (TtaGetScreenDepth () > 1)
  7183. TtaSetPSchema (doc, "HTMLP");
  7184. else
  7185. TtaSetPSchema (doc, "HTMLPBW");
  7186. DocumentSSchema = TtaGetDocumentSSchema (doc);
  7187. /* set attribute dir on the Document element. */
  7188. root = TtaGetMainRoot (doc);
  7189. if (root)
  7190. {
  7191. attrType.AttrSSchema = DocumentSSchema;
  7192. attrType.AttrTypeNum = HTML_ATTR_dir;
  7193. attr = TtaNewAttribute (attrType);
  7194. TtaAttachAttribute (root, attr, doc);
  7195. TtaSetAttributeValue (attr, HTML_ATTR_dir_VAL_ltr_, root,
  7196. doc);
  7197. }
  7198. isHTML = TRUE;
  7199. }
  7200. #ifdef ANNOTATIONS
  7201. if (DocumentTypes[doc] == docAnnot)
  7202. rootElement = ANNOT_GetHTMLRoot (doc, FALSE);
  7203. else
  7204. #endif /* ANNOTATIONS */
  7205. rootElement = TtaGetMainRoot (doc);
  7206. /* add the default attribute PrintURL */
  7207. attrType.AttrSSchema = DocumentSSchema;
  7208. attrType.AttrTypeNum = HTML_ATTR_PrintURL;
  7209. attr = TtaGetAttribute (rootElement, attrType);
  7210. if (!attr)
  7211. {
  7212. attr = TtaNewAttribute (attrType);
  7213. TtaAttachAttribute (rootElement, attr, doc);
  7214. }
  7215. if (MapAreas[doc])
  7216. ChangeAttrOnRoot (doc, HTML_ATTR_ShowAreas);
  7217. }
  7218. dispMode = TtaGetDisplayMode (doc);
  7219. if (dispMode != NoComputedDisplay)
  7220. TtaSetDisplayMode (doc, NoComputedDisplay);
  7221. /* delete all element except the root element and its parent document
  7222. element */
  7223. el = TtaGetFirstChild (rootElement);
  7224. while (el != NULL)
  7225. {
  7226. oldel = el;
  7227. TtaNextSibling (&el);
  7228. TtaDeleteTree (oldel, doc);
  7229. }
  7230. /* save the path or URL of the document */
  7231. TtaExtractName (pathURL, temppath, tempname);
  7232. TtaSetDocumentDirectory (doc, temppath);
  7233. // change the type of the root element if needed
  7234. if (isHTML)
  7235. TtaUpdateRootElementType (rootElement, "HTML", doc);
  7236. /* parse the input file and build the Thot document */
  7237. if (plainText)
  7238. ReadTextFile ((FILE*)stream, NULL, doc, pathURL);
  7239. else
  7240. {
  7241. /* initialize parsing environment */
  7242. InitializeHTMLParser (NULL, FALSE, 0);
  7243. HTMLparse ((FILE*)stream, NULL);
  7244. /* completes all unclosed elements */
  7245. el = HTMLcontext.lastElement;
  7246. while (el != NULL)
  7247. {
  7248. XhtmlElementComplete (&HTMLcontext, el, &error);
  7249. el = TtaGetParent (el);
  7250. }
  7251. /* check the Thot abstract tree */
  7252. CheckAbstractTree (HTMLcontext.doc, IsXTiger (documentName));
  7253. // now load the user style sheet
  7254. if (!external_doc)
  7255. {
  7256. LoadUserStyleSheet (doc);
  7257. UpdateStyleList (doc, 1);
  7258. }
  7259. }
  7260. TtaGZClose (stream);
  7261. if (docURL)
  7262. {
  7263. TtaFreeMemory (docURL);
  7264. docURL = NULL;
  7265. }
  7266. if (docURL2)
  7267. {
  7268. TtaFreeMemory (docURL2);
  7269. docURL2 = NULL;
  7270. }
  7271. if (!external_doc)
  7272. TtaSetDisplayMode (doc, dispMode);
  7273. /* check the Thot abstract tree against the structure schema. */
  7274. TtaSetStructureChecking (TRUE, doc);
  7275. DocumentSSchema = NULL;
  7276. }
  7277. TtaSetDocumentUnmodified (doc);
  7278. HTMLcontext.doc = 0;
  7279. }
  7280. /* end of module */