PageRenderTime 52ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/amaya/html2thot.c

https://github.com/pffy/Amaya-Editor
C | 7618 lines | 6067 code | 333 blank | 1218 comment | 1612 complexity | 29dab6b5f25c8433321a1eb755e2bd6a MD5 | raw file

Large files are truncated, but you can click here to view the full file

  1. /*
  2. *
  3. * (c) COPYRIGHT INRIA and W3C, 1996-2009
  4. * Please first read the full copyright statement in file COPYRIGHT.
  5. *
  6. */
  7. /*
  8. *
  9. * html2thot parses a HTML file and builds the corresponding abstract tree
  10. * for a Thot document of type HTML.
  11. *
  12. * Author: V. Quint
  13. * I. Vatton (W3C/INRIA): XML extension and Unicode
  14. */
  15. #define THOT_EXPORT extern
  16. #include "amaya.h"
  17. #include "css.h"
  18. #include "fetchHTMLname.h"
  19. #include "parser.h"
  20. #include "zlib.h"
  21. #include "AHTURLTools_f.h"
  22. #include "css_f.h"
  23. #include "EDITstyle_f.h"
  24. #include "fetchHTMLname_f.h"
  25. #include "fetchXMLname_f.h"
  26. #include "html2thot_f.h"
  27. #include "HTMLactions_f.h"
  28. #include "HTMLedit_f.h"
  29. #include "HTMLimage_f.h"
  30. #include "HTMLtable_f.h"
  31. #include "HTMLsave_f.h"
  32. #include "init_f.h"
  33. #include "styleparser_f.h"
  34. #include "UIcss_f.h"
  35. #include "XHTMLbuilder_f.h"
  36. #include "Xml2thot_f.h"
  37. #ifdef ANNOTATIONS
  38. #include "annotlib.h"
  39. #include "ANNOTtools_f.h"
  40. #endif /* ANNOTATIONS */
  41. /* tables defined in XHTMLbuilder.c */
  42. extern AttrValueMapping XhtmlAttrValueMappingTable[];
  43. extern XmlEntity XhtmlEntityTable[];
  /* One entry of the Unicode fallback table: maps a Unicode code point to a
     glyph of an 8-bit character set. */
  typedef struct _UnicodeFallbackEntry
  {
    int unicodeVal;   /* the Unicode code */
    int EightbitCode; /* the corresponding glyph to be used from
                         the ISO Latin-1 or Symbol character set.
                         if 0 < EightbitCode < 255, it's the Symbol code for
                         the correct glyph
                         if 1000 < EightbitCode < 1256, it's the ISO Latin-1
                         code + 1000 of an approaching glyph
                         if 2000 < EightbitCode < 2256, it's the Symbol
                         code + 2000 of an approaching glyph */
  }
  UnicodeFallbackEntry;

  UnicodeFallbackEntry UnicodeFallbackTable[] =
  {
    /* This table MUST be ordered according to the first field of each
       entry (Unicode code) */
    /* OElig */ {338, 1079}, /* latin capital ligature OE, U+0152 ISOlat2 */
    /* oelig */ {339, 1111}, /* latin small ligature oe, U+0153 ISOlat2 */
    /* Scaron */ {352, 1083}, /* latin capital letter S with caron, U+0160 ISOlat2 */
    /* scaron */ {353, 1115}, /* latin small letter s with caron, U+0161 ISOlat2 */
    /* Yuml */ {376, 1089}, /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */
    /* fnof */ {402, 166}, /* latin small f with hook = function = florin, U+0192 ISOtech */
    /* epsilon */ {603, 101}, /* latin small letter open e, U+025B (shown as Symbol epsilon) */
    /* circ */ {710, 2217}, /* modifier letter circumflex accent, U+02C6 ISOpub */
    /* tilde */ {732, 1126}, /* small tilde, U+02DC ISOdia */
    /* hat */ {770, 1094}, /* combining circumflex accent, U+0302 */
    /* UnderBar */ {818, 45}, /* U+0332 */
    /* Alpha */ {913, 65}, /* greek capital letter alpha, U+0391 */
    /* Beta */ {914, 66}, /* greek capital letter beta, U+0392 */
    /* Gamma */ {915, 71}, /* greek capital letter gamma, U+0393 ISOgrk3 */
    /* Delta */ {916, 68}, /* greek capital letter delta, U+0394 ISOgrk3 */
    /* Epsilon */ {917, 69}, /* greek capital letter epsilon, U+0395 */
    /* Zeta */ {918, 90}, /* greek capital letter zeta, U+0396 */
    /* Eta */ {919, 72}, /* greek capital letter eta, U+0397 */
    /* Theta */ {920, 81}, /* greek capital letter theta, U+0398 ISOgrk3 */
    /* Iota */ {921, 73}, /* greek capital letter iota, U+0399 */
    /* Kappa */ {922, 75}, /* greek capital letter kappa, U+039A */
    /* Lambda */ {923, 76}, /* greek capital letter lambda, U+039B ISOgrk3 */
    /* Mu */ {924, 77}, /* greek capital letter mu, U+039C */
    /* Nu */ {925, 78}, /* greek capital letter nu, U+039D */
    /* Xi */ {926, 88}, /* greek capital letter xi, U+039E ISOgrk3 */
    /* Omicron */ {927, 79}, /* greek capital letter omicron, U+039F */
    /* Pi */ {928, 80}, /* greek capital letter pi, U+03A0 ISOgrk3 */
    /* Rho */ {929, 82}, /* greek capital letter rho, U+03A1 */
    /* Sigma */ {931, 83}, /* greek capital letter sigma, U+03A3 ISOgrk3 */
    /* Tau */ {932, 84}, /* greek capital letter tau, U+03A4 */
    /* Upsilon */ {933, 85}, /* greek capital letter upsilon, U+03A5 ISOgrk3 */
    /* Phi */ {934, 70}, /* greek capital letter phi, U+03A6 ISOgrk3 */
    /* Chi */ {935, 67}, /* greek capital letter chi, U+03A7 */
    /* Psi */ {936, 89}, /* greek capital letter psi, U+03A8 ISOgrk3 */
    /* Omega */ {937, 87}, /* greek capital letter omega, U+03A9 ISOgrk3 */
    /* alpha */ {945, 97}, /* greek small letter alpha, U+03B1 ISOgrk3 */
    /* beta */ {946, 98}, /* greek small letter beta, U+03B2 ISOgrk3 */
    /* gamma */ {947, 103}, /* greek small letter gamma, U+03B3 ISOgrk3 */
    /* delta */ {948, 100}, /* greek small letter delta, U+03B4 ISOgrk3 */
    /* epsilon */ {949, 101}, /* greek small letter epsilon, U+03B5 ISOgrk3 */
    /* zeta */ {950, 122}, /* greek small letter zeta, U+03B6 ISOgrk3 */
    /* eta */ {951, 104}, /* greek small letter eta, U+03B7 ISOgrk3 */
    /* theta */ {952, 113}, /* greek small letter theta, U+03B8 ISOgrk3 */
    /* iota */ {953, 105}, /* greek small letter iota, U+03B9 ISOgrk3 */
    /* kappa */ {954, 107}, /* greek small letter kappa, U+03BA ISOgrk3 */
    /* lambda */ {955, 108}, /* greek small letter lambda, U+03BB ISOgrk3 */
    /* mu */ {956, 109}, /* greek small letter mu, U+03BC ISOgrk3 */
    /* nu */ {957, 110}, /* greek small letter nu, U+03BD ISOgrk3 */
    /* xi */ {958, 120}, /* greek small letter xi, U+03BE ISOgrk3 */
    /* omicron */ {959, 111}, /* greek small letter omicron, U+03BF NEW */
    /* pi */ {960, 112}, /* greek small letter pi, U+03C0 ISOgrk3 */
    /* rho */ {961, 114}, /* greek small letter rho, U+03C1 ISOgrk3 */
    /* sigmaf */ {962, 86}, /* greek small letter final sigma, U+03C2 ISOgrk3 */
    /* sigma */ {963, 115}, /* greek small letter sigma, U+03C3 ISOgrk3 */
    /* tau */ {964, 116}, /* greek small letter tau, U+03C4 ISOgrk3 */
    /* upsilon */ {965, 117}, /* greek small letter upsilon, U+03C5 ISOgrk3 */
    /* phi */ {966, 106}, /* greek small letter phi, U+03C6 ISOgrk3 */
    /* chi */ {967, 99}, /* greek small letter chi, U+03C7 ISOgrk3 */
    /* psi */ {968, 121}, /* greek small letter psi, U+03C8 ISOgrk3 */
    /* omega */ {969, 119}, /* greek small letter omega, U+03C9 ISOgrk3 */
    /* thetasym */ {977, 74}, /* greek small letter theta symbol, U+03D1 NEW */
    /* upsih */ {978, 161}, /* greek upsilon with hook symbol, U+03D2 NEW */
    /* phiv */ {981, 102}, /* greek phi symbol, U+03D5 ISOgrk3 */
    /* piv */ {982, 118}, /* greek pi symbol, U+03D6 ISOgrk3 */
    /* ensp */ {8194, 1130}, /* en space, U+2002 ISOpub */
    /* emsp */ {8195, 1160}, /* em space, U+2003 ISOpub */
    /* thinsp */ {8201, 1129}, /* thin space, U+2009 ISOpub */
    /* zwnj */ {8204, 1063}, /* zero width non-joiner, U+200C NEW RFC 2070 */
    /* zwj */ {8205, 1063}, /* zero width joiner, U+200D NEW RFC 2070 */
    /* lrm */ {8206, 1063}, /* left-to-right mark, U+200E NEW RFC 2070 */
    /* rlm */ {8207, 1063}, /* right-to-left mark, U+200F NEW RFC 2070 */
    /* ndash */ {8211, 2045}, /* en dash, U+2013 ISOpub */
    /* mdash */ {8212, 2190}, /* em dash, U+2014 ISOpub */
    /* horbar */ {8213, 190}, /* U+2015 */
    /* Verbar */ {8214, 189}, /* U+2016 */
    /* lsquo */ {8216, 1096}, /* left single quotation mark, U+2018 ISOnum */
    /* rsquo */ {8217, 1039}, /* right single quotation mark, U+2019 ISOnum */
    /* sbquo */ {8218, 1044}, /* single low-9 quotation mark, U+201A NEW */
    /* ldquo */ {8220, 1096}, /* left double quotation mark, U+201C ISOnum */
    /* rdquo */ {8221, 1039}, /* right double quotation mark, U+201D ISOnum */
    /* bdquo */ {8222, 1044}, /* double low-9 quotation mark, U+201E NEW */
    /* dagger */ {8224, 2042}, /* dagger, U+2020 ISOpub */
    /* Dagger */ {8225, 2042}, /* double dagger, U+2021 ISOpub */
    /* bull */ {8226, 183}, /* bullet = black small circle, U+2022 ISOpub */
    /* hellip */ {8230, 188}, /* horizontal ellipsis = three dot leader, U+2026 ISOpub */
    /* lre */ {8234, 1063}, /* left-to-right embed, U+202A NEW RFC 2070 */
    /* rle */ {8235, 1063}, /* right-to-left embed, U+202B NEW RFC 2070 */
    /* pdf */ {8236, 1063}, /* pop directional format, U+202C NEW RFC 2070 */
    /* lro */ {8237, 1063}, /* left-to-right override, U+202D NEW RFC 2070 */
    /* rlo */ {8238, 1063}, /* right-to-left override, U+202E NEW RFC 2070 */
    /* permil */ {8240, 2037}, /* per mille sign, U+2030 ISOtech */
    /* prime */ {8242, 162}, /* prime = minutes = feet, U+2032 ISOtech */
    /* Prime */ {8243, 178}, /* double prime = seconds = inches, U+2033 ISOtech */
    /* lsaquo */ {8249, 1060}, /* single left-pointing angle quotation mark, U+2039 ISO proposed */
    /* rsaquo */ {8250, 1062}, /* single right-pointing angle quotation mark, U+203A ISO proposed */
    /* oline */ {8254, 1175}, /* overline = spacing overscore, U+203E NEW */
    /* frasl */ {8260, 164}, /* fraction slash, U+2044 NEW */
    /*ApplyFunction*/ {8289, 1129}, /* function application, U+2061 (shown as thin space) */
    /*InvisibleTimes*/ {8290, 1129}, /* invisible times, U+2062 (shown as thin space) */
    /*InvisibleComa*/ {8291, 1129}, /* invisible separator, U+2063 (shown as thin space) */
    /* euro */ {8364, 2206}, /* euro sign, U+20AC NEW */
    /*TripleDot */ {8411, 188}, /* tdot, U+20DB ISOtech */
    /* image */ {8465, 193}, /* blackletter capital I = imaginary part, U+2111 ISOamso */
    /* copysf */ {8471, 211}, /* U+2117 */
    /* weierp */ {8472, 195}, /* script capital P = power set = Weierstrass p, U+2118 ISOamso */
    /* real */ {8476, 194}, /* blackletter capital R = real part symbol, U+211C ISOamso */
    /* trade */ {8482, 212}, /* trade mark sign, U+2122 ISOnum */
    /* alefsym */ {8501, 192}, /* alef symbol = first transfinite cardinal, U+2135 NEW */
    /*DifferentialD*/ {8518, 1100}, /* U+2146 */
    /*ExponentialE*/ {8519, 1101}, /* U+2147 */
    /*ImaginaryI*/ {8520, 1105}, /* U+2148 */
    /* larr */ {8592, 172}, /* leftwards arrow, U+2190 ISOnum */
    /* uarr */ {8593, 173}, /* upwards arrow, U+2191 ISOnum*/
    /* rarr */ {8594, 174}, /* rightwards arrow, U+2192 ISOnum */
    /* darr */ {8595, 175}, /* downwards arrow, U+2193 ISOnum */
    /* harr */ {8596, 171}, /* left right arrow, U+2194 ISOamsa */
    /* crarr */ {8629, 191}, /* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */
    /* lrarr */ {8646, 171}, /* U+21C6 */
    /* lArr */ {8656, 220}, /* leftwards double arrow, U+21D0 ISOtech */
    /* uArr */ {8657, 221}, /* upwards double arrow, U+21D1 ISOamsa */
    /* rArr */ {8658, 222}, /* rightwards double arrow, U+21D2 ISOtech */
    /* dArr */ {8659, 223}, /* downwards double arrow, U+21D3 ISOamsa */
    /* hArr */ {8660, 219}, /* left right double arrow, U+21D4 ISOamsa */
    /* forall */ {8704, 34}, /* for all, U+2200 ISOtech */
    /* part */ {8706, 182}, /* partial differential, U+2202 ISOtech */
    /* exist */ {8707, 36}, /* there exists, U+2203 ISOtech */
    /* empty */ {8709, 198}, /* empty set = null set = diameter, U+2205 ISOamso */
    /* nabla */ {8711, 209}, /* nabla = backward difference, U+2207 ISOtech */
    /* isin */ {8712, 206}, /* element of, U+2208 ISOtech */
    /* notin */ {8713, 207}, /* not an element of, U+2209 ISOtech */
    /* ni */ {8715, 39}, /* contains as member, U+220B ISOtech */
    /* prod */ {8719, 213}, /* n-ary product = product sign, U+220F ISOamsb */
    /* sum */ {8721, 229}, /* n-ary sumation, U+2211 ISOamsb */
    /* minus */ {8722, 45}, /* minus sign, U+2212 ISOtech */
    /* Backslash*/ {8726, 1092}, /* set minus, U+2216 */
    /* lowast */ {8727, 42}, /* asterisk operator, U+2217 ISOtech */
    /* radic */ {8730, 214}, /* square root = radical sign, U+221A ISOtech */
    /* prop */ {8733, 181}, /* proportional to, U+221D ISOtech */
    /* infin */ {8734, 165}, /* infinity, U+221E ISOtech */
    /* ang */ {8736, 208}, /* angle, U+2220 ISOamso */
    /* VerticalBar*/ {8739, 1124}, /* divides, U+2223 */
    /* parallel */ {8741, 1124}, /* parallel, U+2225 ISOtech */
    /* and */ {8743, 217}, /* logical and = wedge, U+2227 ISOtech */
    /* or */ {8744, 218}, /* logical or = vee, U+2228 ISOtech */
    /* cap */ {8745, 199}, /* intersection = cap, U+2229 ISOtech */
    /* cup */ {8746, 200}, /* union = cup, U+222A ISOtech */
    /* int */ {8747, 242}, /* integral, U+222B ISOtech */
    /* there4 */ {8756, 92}, /* therefore, U+2234 ISOtech */
    /* Colon */ {8759, 58}, /* Colon, U+2237 */
    /* sim */ {8764, 126}, /* tilde operator = varies with = similar to, U+223C ISOtech */
    /*EqualTilde*/ {8770, 64}, /* U+2242 ISOamsr */
    /* cong */ {8773, 64}, /* approximately equal to, U+2245 ISOtech */
    /* asymp */ {8776, 187}, /* almost equal to = asymptotic to, U+2248 ISOamsr */
    /* ne */ {8800, 185}, /* not equal to, U+2260 ISOtech */
    /* equiv */ {8801, 186}, /* identical to, U+2261 ISOtech */
    /* le */ {8804, 163}, /* less-than or equal to, U+2264 ISOtech */
    /* ge */ {8805, 179}, /* greater-than or equal to, U+2265 ISOtech */
    /* sub */ {8834, 204}, /* subset of, U+2282 ISOtech */
    /* sup */ {8835, 201}, /* superset of, U+2283 ISOtech */
    /* nsub */ {8836, 203}, /* not a subset of, U+2284 ISOamsn */
    /* sube */ {8838, 205}, /* subset of or equal to, U+2286 ISOtech */
    /* supe */ {8839, 202}, /* superset of or equal to, U+2287 ISOtech */
    /* subne */ {8842, 203}, /* U+228A */
    /* oplus */ {8853, 197}, /* circled plus = direct sum, U+2295 ISOamsb */
    /* otimes */ {8855, 196}, /* circled times = vector product, U+2297 ISOamsb */
    /* DownTee */ {8868, 94}, /* U+22A4 ISOtech */
    /* perp */ {8869, 94}, /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */
    /* Vee */ {8897, 218}, /* n-ary logical or, U+22C1 ISOamsb */
    /*Intersection*/ {8898, 199}, /* n-ary intersection, U+22C2 ISOamsb */
    /* Union */ {8899, 200}, /* n-ary union, U+22C3 ISOamsb */
    /* Diamond */ {8900, 168}, /* diamond operator, U+22C4 ISOamsb */
    /* sdot */ {8901, 215}, /* dot operator, U+22C5 ISOamsb */
    /* star */ {8902, 1042}, /* star operator, U+22C6 */
    /* Subset */ {8912, 204}, /* U+22D0 */
    /* Cap */ {8914, 199}, /* double intersection, U+22D2 */
    /* Cup */ {8915, 200}, /* double union, U+22D3: use the union glyph,
                              like the cup entry (was 199, the
                              intersection glyph) */
    /* lceil */ {8968, 233}, /* left ceiling = apl upstile, U+2308 ISOamsc */
    /* rceil */ {8969, 249}, /* right ceiling, U+2309 ISOamsc */
    /* lfloor */ {8970, 235}, /* left floor = apl downstile, U+230A ISOamsc */
    /* rfloor */ {8971, 251}, /* right floor, U+230B ISOamsc */
    /* lang */ {9001, 225}, /* left-pointing angle bracket = bra, U+2329 ISOtech */
    /* rang */ {9002, 241}, /* right-pointing angle bracket = ket, U+232A ISOtech */
    /* dtri */ {9663, 209}, /* white down-pointing small triangle, U+25BF */
    /* loz */ {9674, 224}, /* lozenge, U+25CA ISOpub */
    /* spades */ {9824, 170}, /* black spade suit, U+2660 ISOpub */
    /* clubs */ {9827, 167}, /* black club suit = shamrock, U+2663 ISOpub */
    /* hearts */ {9829, 169}, /* black heart suit = valentine, U+2665 ISOpub */
    /* diams */ {9830, 168}, /* black diamond suit, U+2666 ISOpub */
    /* lang */ {0x27E8, 225}, /* U+27E8 Mathematical left angle bracket */
    /* rang */ {0x27E9, 241}, /* U+27E9 Mathematical right angle bracket */
    /* And */ {10835, 217}, /* U+2A53 */
    /* Or */ {10836, 218}, /* U+2A54 */
    /* Equal */ {10869, 1061}, /* U+2A75 */
    /* Not */ {10988, 216}, /* U+2AEC */
    /* OverBrace*/ {65079, 132}, /* U+FE37 */
    /*UnderBrace*/ {65080, 133}, /* U+FE38 */
    /* THE END */ {0, 0} /* last entry (required) */
  };
  258. typedef struct _ElemToBeChecked *PtrElemToBeChecked; /* pointer to a list cell */
  259. typedef struct _ElemToBeChecked /* cell of a singly linked list of elements */
  260. {
  261. Element Elem; /* the element to be checked */
  262. PtrElemToBeChecked nextElemToBeChecked; /* following cell of the list */
  263. }
  264. ElemToBeChecked;
  265. /* empty elements: list of element type numbers, terminated by 0 */
  266. static int EmptyElement[] =
  267. {
  268. HTML_EL_AREA,
  269. HTML_EL_BASE,
  270. HTML_EL_BaseFont,
  271. HTML_EL_BR,
  272. HTML_EL_COL,
  273. HTML_EL_FRAME,
  274. HTML_EL_Horizontal_Rule,
  275. HTML_EL_IMG,
  276. HTML_EL_Input,
  277. HTML_EL_ISINDEX,
  278. HTML_EL_LINK,
  279. HTML_EL_META,
  280. HTML_EL_Parameter,
  281. HTML_EL_PICTURE_UNIT,
  282. 0}; /* 0 ends the list */
  283. /* character level elements: list of element type numbers, terminated by 0 */
  284. static int CharLevelElement[] =
  285. {
  286. HTML_EL_TEXT_UNIT, HTML_EL_PICTURE_UNIT, HTML_EL_SYMBOL_UNIT,
  287. HTML_EL_Anchor,
  288. HTML_EL_Teletype_text, HTML_EL_Italic_text, HTML_EL_Bold_text,
  289. HTML_EL_Underlined_text, HTML_EL_Struck_text, HTML_EL_Big_text,
  290. HTML_EL_Small_text,
  291. HTML_EL_Emphasis, HTML_EL_Strong, HTML_EL_Def, HTML_EL_Code, HTML_EL_Sample,
  292. HTML_EL_Keyboard, HTML_EL_Variable_, HTML_EL_Cite, HTML_EL_ABBR,
  293. HTML_EL_ACRONYM,
  294. HTML_EL_Font_, HTML_EL_Quotation, HTML_EL_Subscript, HTML_EL_Superscript,
  295. HTML_EL_Span, HTML_EL_BDO, HTML_EL_ins, HTML_EL_del,
  296. HTML_EL_IMG, HTML_EL_Input,
  297. HTML_EL_Option, HTML_EL_OptGroup, HTML_EL_Option_Menu,
  298. HTML_EL_Text_Input, HTML_EL_Password_Input, HTML_EL_File_Input,
  299. HTML_EL_Checkbox_Input, HTML_EL_Radio_Input, HTML_EL_Submit_Input,
  300. HTML_EL_Reset_Input, HTML_EL_Hidden_Input, HTML_EL_Inserted_Text,
  301. HTML_EL_Button_Input, HTML_EL_BUTTON_,
  302. HTML_EL_LABEL,
  303. HTML_EL_BR, HTML_EL_ruby,
  304. HTML_EL_Object, HTML_EL_Basic_Elem,
  305. 0}; /* 0 ends the list */
  306. /* block level elements, i.e. elements having a Line rule in the presentation
  307. schema of the main view; list of element type numbers, terminated by 0 */
  308. static int BlockLevelElement[] =
  309. {
  310. HTML_EL_H1, HTML_EL_H2, HTML_EL_H3, HTML_EL_H4, HTML_EL_H5, HTML_EL_H6,
  311. HTML_EL_Paragraph, HTML_EL_Pseudo_paragraph, HTML_EL_Text_Area,
  312. HTML_EL_Term, HTML_EL_Address, HTML_EL_LEGEND, HTML_EL_CAPTION,
  313. HTML_EL_INS, HTML_EL_DEL, HTML_EL_Division,
  314. 0}; /* 0 ends the list */
  315. /* start tags that imply the end of a current element */
  316. /* any tag of each line implies the end of the current element if the type of
  317. that element is in the same line */
  318. typedef char oneLine[100]; /* one table line: up to 99 characters plus the final NUL */
  319. static oneLine EquivEndingElem[] =
  320. {
  321. "dt dd li option",
  322. "h1 h2 h3 h4 h5 h6",
  323. "address pre listing xmp",
  324. "" /* empty line: presumably marks the end of the table - confirm in the code reading it */
  325. };
  326. /* according to the HTML DTD, HR should be added to the 2nd line above, as it */
  327. /* is not allowed within a H1, H2, H3, etc. But we should tolerate that case */
  328. /* because many documents contain rules in headings... */
  329. /* start tags that imply the end of current element.
  Each line reads "tag closes t1 t2 ...": the first word is a start tag, the
  words after "closes" are element names.
  NOTE(review): "p*" presumably stands for the pseudo-paragraph - confirm
  against the mapping table. */
  330. static oneLine StartTagEndingElem[] =
  331. {
  332. "form closes form p p* hr h1 h2 h3 h4 h5 h6 dl ul ol menu dir address pre listing xmp head",
  333. "head closes p p*",
  334. "title closes p p*",
  335. "body closes head style script title p p*",
  336. "li closes p p* h1 h2 h3 h4 h5 h6 dl address pre listing xmp head",
  337. "hr closes p p* head",
  338. "h1 closes p p* head",
  339. "h2 closes p p* head",
  340. "h3 closes p p* head",
  341. "h4 closes p p* head",
  342. "h5 closes p p* head",
  343. "h6 closes p p* head",
  344. "dir closes p p* head",
  345. "address closes p p* head ul",
  346. "pre closes p p* head ul",
  347. "listing closes p p* head",
  348. "xmp closes p p* head",
  349. "blockquote closes p p* head",
  350. "dl closes p p* dt menu dir address pre listing xmp head",
  351. "dt closes p p* menu dir address pre listing xmp head",
  352. "dd closes p p* menu dir address pre listing xmp head",
  353. "ul closes p p* head ol menu dir address pre listing xmp",
  354. "ol closes p p* head ul",
  355. "menu closes p p* head ul",
  356. "p closes p p* head h1 h2 h3 h4 h5 h6",
  357. "p* closes p p* head",
  358. "div closes p p* head",
  359. "noscript closes p p* head",
  360. "center closes font b i p p* head",
  361. "a closes a",
  362. "caption closes p p*",
  363. "colgroup closes caption colgroup col p p*",
  364. "col closes caption col p p*",
  365. "table closes p p* head h1 h2 h3 h4 h5 h6 pre listing xmp a",
  366. "th closes th td",
  367. "td closes th td",
  368. "tr closes th td tr caption col colgroup",
  369. "thead closes caption col colgroup",
  370. "tfoot closes th td tr caption col colgroup thead tbody",
  371. "tbody closes th td tr caption col colgroup thead tfoot tbody",
  372. "optgroup closes option",
  373. "fieldset closes legend p p* head h1 h2 h3 h4 h5 h6 pre listing xmp a",
  374. "" /* empty line: presumably marks the end of the table - confirm in the code reading it */
  375. };
  376. typedef int State; /* a state of the automaton */
  377. extern int HTML_ENTRIES; /* defined outside this file */
  378. static PtrClosedElement *FirstClosedElem; /* NOTE(review): presumably built from StartTagEndingElem - confirm */
  379. /* ---------------------- static variables ---------------------- */
  380. /* parser stack: four parallel arrays indexed by stack level */
  381. #define MaxStack 200 /* maximum stack height */
  382. static int GINumberStack[MaxStack]; /* entry of pHTMLGIMapping */
  383. static Element ElementStack[MaxStack]; /* element in the Thot abstract
  384. tree */
  385. static int ThotLevel[MaxStack]; /* level of element in the Thot
  386. tree */
  387. static Language LanguageStack[MaxStack]; /* element language */
  388. static int StackLevel = 0; /* first free element on the
  389. stack; the top entry is at StackLevel - 1 */
  390. /* information about the input file */
  391. #define INPUT_FILE_BUFFER_SIZE 2000 /* capacity of FileBuffer, final byte excluded */
  392. #define PREV_READ_CHARS 30 /* capacity of PreviousRead, final byte excluded */
  393. static char FileBuffer[INPUT_FILE_BUFFER_SIZE+1];
  394. static char PreviousRead[PREV_READ_CHARS+1];
  395. static char *WorkBuffer = FileBuffer; /* initially points to FileBuffer */
  396. static int LastCharInWorkBuffer = 0; /* last char. in the buffer */
  397. static int LastCharInPreviousRead = 0; /* NOTE(review): presumably last char. in PreviousRead - confirm */
  398. static int CurrentBufChar = 0; /* current character read */
  399. static int StartOfTagIndx = 0; /* last "<" read */
  400. static int StartOfRead = 0;
  401. static char PreviousBufChar = EOS; /* previous character read */
  402. static char *InputText = NULL;
  403. static gzFile stream = 0; /* zlib file handle, 0 initially */
  404. static int NumberOfLinesRead = 0;/* number of lines read in the
  405. file */
  406. static int NumberOfCharRead = 0; /* number of characters read in the
  407. current line */
  408. static ThotBool EmptyLine = TRUE; /* no printable character encountered
  409. yet in the current line */
  410. static ThotBool StartOfFile = TRUE; /* no printable character encountered
  411. yet in the file */
  412. static ThotBool AfterTagPRE = FALSE; /* <PRE> has just been read */
  413. static char* docURL = NULL; /* path or URL of the document */
  414. static char *docURL2 = NULL; /* copy of docURL kept for some cases of parsing errors */
  415. /* Static variables used for the call to the XML parser */
  416. static ThotBool NotToReadFile = FALSE;
  417. static int PreviousNumberOfLinesRead = 0;
  418. static int PreviousNumberOfCharRead = 0;
  419. /* Boolean that indicates the end of an HTML file. */
  420. /* It is a static variable because it is passed as a parameter */
  421. /* in the call to the new XML parser (EndOfStartGI) */
  422. static ThotBool EndOfHtmlFile;
  423. /* input buffer */
  424. #define MaxBufferLength 1000 /* size of inputBuffer */
  425. #define AllmostFullBuffer 700 /* (sic) NOTE(review): presumably the fill level treated as "almost full" - confirm */
  426. #define MaxMsgLength 300 /* maximum size of error messages */
  427. static unsigned char inputBuffer[MaxBufferLength];
  428. static int LgBuffer = 0; /* actual length of text in input
  429. buffer */
  430. static int BufferLineNumber = 0; /* line number in the source file of
  431. the beginning of the text
  432. contained in the buffer */
  433. /* information about the Thot document under construction */
  434. /* global data used by the HTML parser */
  435. static ParserData HTMLcontext = {0, ISO_8859_1, 0, NULL, 0,
  436. FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
  437. static SSchema DocumentSSchema = NULL; /* the HTML structure schema */
  438. static Element rootElement = NULL; /* root element of the document */
  439. static int lastElemEntry = 0; /* index in the pHTMLGIMapping of the
  440. element being created */
  441. static Attribute lastAttribute = NULL; /* last attribute created */
  442. static Attribute lastAttrElement = NULL;/* element with which the last
  443. attribute has been associated.
  NOTE(review): declared as Attribute although it is described as an element - confirm the intended type */
  444. static AttributeMapping *lastAttrEntry = NULL; /* entry in the AttributeMappingTable
  445. of the attribute being created */
  446. static ThotBool UnknownAttr = FALSE; /* the last attribute encountered is
  447. invalid */
  448. static ThotBool ReadingAnAttrValue = FALSE;
  449. static ThotBool TruncatedAttrValue = FALSE;
  450. static char *BufferAttrValue = NULL;
  451. static int LgBufferAttrValue = 0;
  452. static Element CommentText = NULL; /* TEXT element of the current
  453. Comment element */
  454. static Element ASPText = NULL; /* TEXT element of the current
  455. ASP element */
  456. static Element PIText = NULL; /* TEXT element of the current
  457. PI element (the comment used to say ASP, presumably a copy/paste slip) */
  458. static ThotBool UnknownTag = FALSE; /* the last start tag encountered is
  459. invalid */
  460. static ThotBool HTMLrootClosed = FALSE;
  461. static char *HTMLrootClosingTag = NULL;
  462. static PtrElemToBeChecked FirstElemToBeChecked = NULL; /* first cell of the list of elements to be checked */
  463. static PtrElemToBeChecked LastElemToBeChecked = NULL; /* last cell of that list */
  464. /* automaton */
  465. static State currentState; /* current state of the automaton */
  466. static State returnState; /* return state from subautomaton */
  467. static ThotBool NormalTransition;
  468. static ThotBool CharProcessed;
  469. /* information about an entity being read */
  470. static char EntityName[MaxEntityLength];/* name of entity being read */
  471. static int LgEntityName = 0; /* length of entity name read so
  472. far */
  473. static int EntityTableEntry = 0; /* entry of the entity table that
  474. matches the entity read so far */
  475. static int CharRank = 0; /* rank of the last matching
  476. character in that entry */
  477. /* second char of a UTF-8 string (six bytes, all initialised to EOS) */
  478. static unsigned char SecondByte[6] = {EOS, EOS, EOS, EOS, EOS, EOS};
  479. static void ProcessStartGI (const char* GIname); /* forward declaration */
  480. static void EndOfAttrValue (char c); /* forward declaration */
  /*----------------------------------------------------------------------
    StrCaseStr
    Case-insensitive substring search.
    Returns a pointer to the first occurrence of str2 within str1, letter
    case being ignored, or NULL when there is no occurrence, when one of
    the arguments is NULL, or when str2 is empty.
    ----------------------------------------------------------------------*/
  static const char *StrCaseStr (const char *str1, const char *str2)
  {
    const char *start;
    const char *h;
    const char *n;

    if (str1 == NULL || str2 == NULL)
      return NULL;
    /* an empty pattern has never been reported as a match by this function */
    if (*str2 == EOS)
      return NULL;
    for (start = str1; *start != EOS; start++)
      {
        h = start;
        n = str2;
        /* both sides are folded to lower case, so a pattern starting with an
           upper-case letter matches too; the casts keep tolower() defined
           for characters above 127 */
        while (*n != EOS &&
               tolower ((unsigned char) *h) == tolower ((unsigned char) *n))
          {
            h++;
            n++;
          }
        if (*n == EOS)
          /* the whole pattern has been matched */
          return start;
      }
    return NULL;
  }
  /*----------------------------------------------------------------------
    SkipSep
    Skips the separators (spaces and commas) found at the beginning of ptr.
    Returns a pointer to the first character that is not a separator
    (possibly the end of the string), or NULL when ptr is NULL.
    ----------------------------------------------------------------------*/
  char *SkipSep (char *ptr)
  {
    /* callers test the returned pointer, so tolerate a NULL string */
    if (ptr == NULL)
      return NULL;
    while (*ptr == SPACE || *ptr == ',')
      ptr++;
    return (ptr);
  }
  /*----------------------------------------------------------------------
    SkipInt
    Skips the current token: moves forward until a space, a comma or the
    end of the string is found.
    Returns a pointer to that character, or NULL when ptr is NULL.
    ----------------------------------------------------------------------*/
  char *SkipInt (char *ptr)
  {
    /* callers test the returned pointer, so tolerate a NULL string */
    if (ptr == NULL)
      return NULL;
    while (*ptr != EOS && *ptr != SPACE && *ptr != ',')
      ptr++;
    return (ptr);
  }
  518. /*----------------------------------------------------------------------
  519. ParseAreaCoords
  520. Computes x, y, width and height of the box from the coords attribute value.
  521. ----------------------------------------------------------------------*/
  522. void ParseAreaCoords (Element element, Document document)
  523. {
  524. ElementType elType;
  525. AttributeType attrType;
  526. Attribute attrCoords, attrX, attrY;
  527. Attribute attrW, attrH, attrShape;
  528. char *ptr3, *text;
  529. int x1, y1, x2, y2;
  530. int length, shape, r;
  531. /* Is it an AREA element */
  532. elType = TtaGetElementType (element);
  533. if (elType.ElTypeNum != HTML_EL_AREA)
  534. return;
  535. /* Search the coords attribute */
  536. attrType.AttrSSchema = elType.ElSSchema;
  537. attrType.AttrTypeNum = HTML_ATTR_coords;
  538. attrCoords = TtaGetAttribute (element, attrType);
  539. if (attrCoords == NULL)
  540. return;
  541. /* Search the shape attribute */
  542. attrType.AttrTypeNum = HTML_ATTR_shape;
  543. attrShape = TtaGetAttribute (element, attrType);
  544. if (attrShape == NULL)
  545. /* no shape attribute. Create one with value rectangle */
  546. {
  547. attrShape = TtaNewAttribute (attrType);
  548. TtaAttachAttribute (element, attrShape, document);
  549. shape = HTML_ATTR_shape_VAL_rectangle;
  550. TtaSetAttributeValue (attrShape, shape, element, document);
  551. }
  552. else
  553. shape = TtaGetAttributeValue (attrShape);
  554. length = TtaGetTextAttributeLength (attrCoords);
  555. text = (char*)TtaGetMemory (length + 1);
  556. TtaGiveTextAttributeValue (attrCoords, text, &length);
  557. if (shape == HTML_ATTR_shape_VAL_rectangle ||
  558. shape == HTML_ATTR_shape_VAL_circle)
  559. {
  560. /* Search the x_coord attribute */
  561. attrType.AttrTypeNum = HTML_ATTR_x_coord;
  562. attrX = TtaGetAttribute (element, attrType);
  563. if (attrX == NULL)
  564. {
  565. /* create it */
  566. attrX = TtaNewAttribute (attrType);
  567. TtaAttachAttribute (element, attrX, document);
  568. }
  569. /* Search the y_coord attribute */
  570. attrType.AttrTypeNum = HTML_ATTR_y_coord;
  571. attrY = TtaGetAttribute (element, attrType);
  572. if (attrY == NULL)
  573. {
  574. /* create it */
  575. attrY = TtaNewAttribute (attrType);
  576. TtaAttachAttribute (element, attrY, document);
  577. }
  578. /* Search the width attribute */
  579. attrType.AttrTypeNum = HTML_ATTR_IntWidthPxl;
  580. attrW = TtaGetAttribute (element, attrType);
  581. if (attrW == NULL)
  582. {
  583. /* create it */
  584. attrW = TtaNewAttribute (attrType);
  585. TtaAttachAttribute (element, attrW, document);
  586. }
  587. /* Search the height attribute */
  588. attrType.AttrTypeNum = HTML_ATTR_IntHeightPxl;
  589. attrH = TtaGetAttribute (element, attrType);
  590. if (attrH == NULL)
  591. {
  592. /* create it */
  593. attrH = TtaNewAttribute (attrType);
  594. TtaAttachAttribute (element, attrH, document);
  595. }
  596. if (shape == HTML_ATTR_shape_VAL_rectangle)
  597. {
  598. x1 = x2 = y1 = y2 = 0;
  599. ptr3 = text;
  600. if (ptr3)
  601. sscanf (ptr3, "%d", &x1);
  602. ptr3 = SkipInt (ptr3);
  603. ptr3 = SkipSep (ptr3);
  604. if (ptr3)
  605. sscanf (ptr3, "%d", &y1);
  606. ptr3 = SkipInt (ptr3);
  607. ptr3 = SkipSep (ptr3);
  608. if (ptr3)
  609. sscanf (ptr3, "%d", &x2);
  610. ptr3 = SkipInt (ptr3);
  611. ptr3 = SkipSep (ptr3);
  612. sscanf (ptr3, "%d", &y2);
  613. TtaSetAttributeValue (attrX, x1, element, document);
  614. TtaSetAttributeValue (attrY, y1, element, document);
  615. TtaSetAttributeValue (attrW, x2 - x1, element, document);
  616. TtaSetAttributeValue (attrH, y2 - y1, element, document);
  617. }
  618. else
  619. {
  620. x1 = y1 = r = 0;
  621. ptr3 = text;
  622. if (ptr3)
  623. sscanf (ptr3, "%d", &x1);
  624. ptr3 = SkipInt (ptr3);
  625. ptr3 = SkipSep (ptr3);
  626. if (ptr3)
  627. sscanf (ptr3, "%d", &y1);
  628. ptr3 = SkipInt (ptr3);
  629. ptr3 = SkipSep (ptr3);
  630. if (ptr3)
  631. sscanf (ptr3, "%d", &r);
  632. TtaSetAttributeValue (attrX, x1 - r, element, document);
  633. TtaSetAttributeValue (attrY, y1 - r, element, document);
  634. TtaSetAttributeValue (attrW, 2 * r, element, document);
  635. TtaSetAttributeValue (attrH, 2 * r, element, document);
  636. }
  637. }
  638. else if (shape == HTML_ATTR_shape_VAL_polygon)
  639. {
  640. element = TtaGetFirstChild (element);
  641. length = TtaGetPolylineLength (element);
  642. /* remove previous points */
  643. while (length > 1)
  644. {
  645. TtaDeletePointInPolyline (element, length, document);
  646. length--;
  647. }
  648. length = 1;
  649. ptr3 = text;
  650. /* add new points */
  651. while (*ptr3 != EOS)
  652. {
  653. x1 = y1 = 0;
  654. sscanf (ptr3, "%d", &x1);
  655. ptr3 = SkipInt (ptr3);
  656. ptr3 = SkipSep (ptr3);
  657. if (ptr3)
  658. sscanf (ptr3, "%d", &y1);
  659. ptr3 = SkipInt (ptr3);
  660. ptr3 = SkipSep (ptr3);
  661. TtaAddPointInPolyline (element, length, UnPixel, x1, y1,document,
  662. FALSE);
  663. length++;
  664. }
  665. }
  666. TtaFreeMemory (text);
  667. }
  668. /*----------------------------------------------------------------------
  669. SetLanguagInHTMLStack
  670. Sets the value of the language.
  671. ----------------------------------------------------------------------*/
  672. void SetLanguagInHTMLStack (Language lang)
  673. {
  674. LanguageStack[StackLevel - 1] = lang;
  675. }
  676. /*----------------------------------------------------------------------
  677. IsHtmlParsingCSS
  678. Returns the value of ParsingCSS boolean.
  679. ----------------------------------------------------------------------*/
  680. ThotBool IsHtmlParsingCSS ()
  681. {
  682. return HTMLcontext.parsingCSS;
  683. }
  684. /*----------------------------------------------------------------------
  685. SetHtmlParsingCSS
  686. Sets the value of ParsingCSS boolean.
  687. ----------------------------------------------------------------------*/
  688. void SetHtmlParsingCSS (ThotBool value)
  689. {
  690. HTMLcontext.parsingCSS = value;
  691. }
  692. /*----------------------------------------------------------------------
  693. SetHtmlParsingTextArea
  694. Sets the value of ParsingTextArea boolean.
  695. ----------------------------------------------------------------------*/
  696. void SetHtmlParsingTextArea (ThotBool value)
  697. {
  698. HTMLcontext.parsingTextArea = value;
  699. }
  700. /*----------------------------------------------------------------------
  701. SetHtmlParsingScript
  702. Sets the value of ParsingScript boolean.
  703. ----------------------------------------------------------------------*/
  704. void SetHtmlParsingScript (ThotBool value)
  705. {
  706. HTMLcontext.parsingScript = value;
  707. }
  708. /*----------------------------------------------------------------------
  709. SetHtmlElemLineNumber
  710. Assigns the current line number
  711. ----------------------------------------------------------------------*/
  712. void SetHtmlElemLineNumber (Element el)
  713. {
  714. TtaSetElementLineNumber (el, NumberOfLinesRead);
  715. }
  716. /*----------------------------------------------------------------------
  717. IsWithinHtmlTable
  718. Returns the value of WithinTable integer.
  719. ----------------------------------------------------------------------*/
  720. int IsWithinHtmlTable ()
  721. {
  722. return HTMLcontext.withinTable;
  723. }
  724. /*----------------------------------------------------------------------
  725. copyCEstring create a copy of the string of elements pointed
  726. by first and return a pointer on the first
  727. element of the copy.
  728. ----------------------------------------------------------------------*/
  729. static PtrClosedElement copyCEstring (PtrClosedElement first)
  730. {
  731. PtrClosedElement ret, cur, next, prev;
  732. ret = NULL;
  733. cur = first;
  734. prev = NULL;
  735. while (cur != NULL)
  736. {
  737. next = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
  738. next->nextClosedElem = NULL;
  739. next->tagNum = cur->tagNum;
  740. if (ret == NULL)
  741. ret = next;
  742. else
  743. prev->nextClosedElem = next;
  744. prev = next;
  745. cur = cur->nextClosedElem;
  746. }
  747. return ret;
  748. }
  749. /*----------------------------------------------------------------------
  750. InitMapping intialise the list of the elements closed by
  751. each start tag.
  752. ----------------------------------------------------------------------*/
  753. void InitMapping (void)
  754. {
  755. int line;
  756. int entry;
  757. int ptr;
  758. int i;
  759. typeName name;
  760. PtrClosedElement newCE, lastCE, firstCE, curCE;
  761. SSchema schema;
  762. /* building the table */
  763. FirstClosedElem = (PtrClosedElement *)TtaGetMemory (HTML_ENTRIES * sizeof(PtrClosedElement));
  764. for (entry = 0; entry < HTML_ENTRIES; entry++)
  765. FirstClosedElem[entry] = NULL;
  766. /* read table EquivEndingElem */
  767. line = 0;
  768. do
  769. /* read one line of EquivEndingElem */
  770. {
  771. ptr = 0;
  772. lastCE = NULL;
  773. firstCE = NULL;
  774. do
  775. {
  776. /* read one identifier */
  777. i = 0;
  778. while (EquivEndingElem[line][ptr] != SPACE &&
  779. EquivEndingElem[line][ptr] != EOS)
  780. name[i++] = EquivEndingElem[line][ptr++];
  781. name[i] = EOS;
  782. ptr++;
  783. if (i > 0)
  784. /* a identifier has been read */
  785. {
  786. schema = DocumentSSchema;
  787. entry = MapGI ((char *)name, &schema, HTMLcontext.doc);
  788. #ifdef DEBUG
  789. if (entry < 0)
  790. fprintf (stderr, "error in EquivEndingElem: tag %s unknown in line\n%s\n", name, EquivEndingElem[line]);
  791. else
  792. #endif
  793. {
  794. newCE = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
  795. newCE->nextClosedElem = NULL;
  796. newCE->tagNum = entry;
  797. if (firstCE == NULL)
  798. firstCE = newCE;
  799. else
  800. lastCE->nextClosedElem = newCE;
  801. lastCE = newCE;
  802. }
  803. }
  804. }
  805. while (EquivEndingElem[line][ptr] != EOS);
  806. /* one line has been read */
  807. curCE = firstCE;
  808. while (curCE != NULL)
  809. {
  810. if (curCE->nextClosedElem == NULL)
  811. newCE = firstCE;
  812. else
  813. newCE = copyCEstring (firstCE);
  814. if (FirstClosedElem[curCE->tagNum] == NULL)
  815. FirstClosedElem[curCE->tagNum] = newCE;
  816. else
  817. {
  818. lastCE = FirstClosedElem[curCE->tagNum];
  819. while (lastCE->nextClosedElem != NULL)
  820. lastCE = lastCE->nextClosedElem;
  821. lastCE->nextClosedElem = newCE;
  822. }
  823. curCE = curCE->nextClosedElem;
  824. }
  825. line++;
  826. }
  827. while (strcmp (EquivEndingElem[line], "") != 0);
  828. /* read table StartTagEndingElem */
  829. line = 0;
  830. do
  831. /* read one line of StartTagEndingElem */
  832. {
  833. ptr = 0;
  834. i = 0;
  835. /* read the first tag name of the line */
  836. while (StartTagEndingElem[line][ptr] != SPACE &&
  837. StartTagEndingElem[line][ptr] != EOS)
  838. name[i++] = StartTagEndingElem[line][ptr++];
  839. name[i] = EOS;
  840. i = 0;
  841. ptr++;
  842. schema = DocumentSSchema;
  843. entry = MapGI ((char *)name, &schema, HTMLcontext.doc);
  844. #ifdef DEBUG
  845. if (entry < 0)
  846. fprintf (stderr, "error in StartTagEndingElem: tag %s unknown in line\n%s\n", name, StartTagEndingElem[line]);
  847. #endif
  848. /* read the keyword "closes" */
  849. while (StartTagEndingElem[line][ptr] != SPACE &&
  850. StartTagEndingElem[line][ptr] != EOS)
  851. name[i++] = StartTagEndingElem[line][ptr++];
  852. name[i] = EOS;
  853. i = 0;
  854. ptr++;
  855. #ifdef DEBUG
  856. if (strcmp (name, "closes") != 0)
  857. fprintf (stderr, "error in StartTagEndingElem: \"%s\" instead of \"closes\" in line\n%s\n", name, StartTagEndingElem[line]);
  858. #endif
  859. lastCE = FirstClosedElem[entry];
  860. if (lastCE != NULL)
  861. while (lastCE->nextClosedElem != NULL)
  862. lastCE = lastCE->nextClosedElem;
  863. do
  864. {
  865. while (StartTagEndingElem[line][ptr] != SPACE &&
  866. StartTagEndingElem[line][ptr] != EOS)
  867. name[i++] = StartTagEndingElem[line][ptr++];
  868. name[i] = EOS;
  869. ptr++;
  870. if (i > 0)
  871. {
  872. i = 0;
  873. newCE = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
  874. newCE->nextClosedElem = NULL;
  875. schema = DocumentSSchema;
  876. newCE->tagNum = MapGI ((char *)name, &schema, HTMLcontext.doc);
  877. #ifdef DEBUG
  878. if (newCE->tagNum < 0)
  879. fprintf (stderr, "error in StartTagEndingElem: tag %s unknown in line\n%s\n", name, StartTagEndingElem[line]);
  880. #endif
  881. if (lastCE == NULL)
  882. FirstClosedElem[entry] = newCE;
  883. else
  884. lastCE->nextClosedElem = newCE;
  885. lastCE = newCE;
  886. }
  887. }
  888. while (StartTagEndingElem[line][ptr] != EOS);
  889. line++;
  890. }
  891. while (strcmp (StartTagEndingElem[line], "") != 0);
  892. }
  893. /*----------------------------------------------------------------------
  894. Within checks if an element of type ThotType is in the stack.
  895. ----------------------------------------------------------------------*/
  896. static ThotBool Within (int ThotType, SSchema ThotSSchema)
  897. {
  898. ThotBool ret;
  899. int i;
  900. ElementType elType;
  901. ret = FALSE;
  902. i = StackLevel - 1;
  903. while (i >= 0 && !ret)
  904. {
  905. if (ElementStack[i] != NULL)
  906. {
  907. elType = TtaGetElementType (ElementStack[i]);
  908. if (elType.ElTypeNum == ThotType &&
  909. elType.ElSSchema == ThotSSchema)
  910. ret = TRUE;
  911. }
  912. i--;
  913. }
  914. return ret;
  915. }
  916. /*----------------------------------------------------------------------
  917. HTMLParseError print the error message msg on stderr.
  918. If lineNumber = 0, print the current line number in the source file,
  919. otherwise print the line number provided.
  920. ----------------------------------------------------------------------*/
  921. void HTMLParseError (Document doc, const char* msg, int lineNumber)
  922. {
  923. if (IgnoreErrors)
  924. return;
  925. HTMLErrorsFound = TRUE;
  926. if (!ErrFile)
  927. if (OpenParsingErrors (doc) == FALSE)
  928. return;
  929. if (doc == HTMLcontext.doc)
  930. {
  931. /* the error message is related to the document being parsed */
  932. if (docURL != NULL)
  933. {
  934. if (!XMLErrorsFound)
  935. fprintf (ErrFile, "\n*** Errors/warnings in %s\n", docURL);
  936. TtaFreeMemory (docURL);
  937. docURL = NULL;
  938. }
  939. else
  940. {
  941. if (CSSErrorsFound && docURL2)
  942. {
  943. fprintf (ErrFile, "\n*** Errors/warnings in %s\n", docURL2);
  944. TtaFreeMemory (docURL2);
  945. docURL2 = NULL;
  946. }
  947. }
  948. if (lineNumber <= 0)
  949. /* print the line number and character number before the message */
  950. fprintf (ErrFile, "@ line %d, char %d: %s\n", NumberOfLinesRead,
  951. NumberOfCharRead, msg);
  952. else
  953. fprintf (ErrFile, "@ line %d, char 0: %s\n", lineNumber, msg);
  954. }
  955. else
  956. /* print only the error message */
  957. fprintf (ErrFile, "%s\n", msg);
  958. }
  959. /*----------------------------------------------------------------------
  960. CloseBuffer close the input buffer.
  961. ----------------------------------------------------------------------*/
  962. static void CloseBuffer ()
  963. {
  964. inputBuffer[LgBuffer] = EOS;
  965. }
  966. /*----------------------------------------------------------------------
  967. InitBuffer initialize the input buffer.
  968. ----------------------------------------------------------------------*/
  969. static void InitBuffer ()
  970. {
  971. LgBuffer = 0;
  972. }
  973. static ThotBool InsertElement (Element * el);
  974. /*----------------------------------------------------------------------
  975. InsertSibling return TRUE if the new element must be inserted
  976. in the Thot document as a sibling of lastElement;
  977. return FALSE it it must be inserted as a child.
  978. ----------------------------------------------------------------------*/
  979. static ThotBool InsertSibling ()
  980. {
  981. if (StackLevel == 0)
  982. return FALSE;
  983. else if (HTMLcontext.lastElementClosed ||
  984. TtaIsLeaf (TtaGetElementType (HTMLcontext.lastElement)) ||
  985. (GINumberStack[StackLevel - 1] >= 0 &&
  986. pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLcontents == 'E'))
  987. return TRUE;
  988. else
  989. return FALSE;
  990. }
  991. /*----------------------------------------------------------------------
  992. IsEmptyElement return TRUE if element el is defined as an empty element.
  993. ----------------------------------------------------------------------*/
  994. static ThotBool IsEmptyElement (Element el)
  995. {
  996. ElementType elType;
  997. int i;
  998. ThotBool ret;
  999. ret = FALSE;
  1000. elType = TtaGetElementType (el);
  1001. if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
  1002. return ret;
  1003. i = 0;
  1004. while (EmptyElement[i] > 0 && EmptyElement[i] != elType.ElTypeNum)
  1005. i++;
  1006. if (EmptyElement[i] == elType.ElTypeNum)
  1007. ret = TRUE;
  1008. return ret;
  1009. }
  1010. /*----------------------------------------------------------------------
  1011. IsCharacterLevelType return TRUE if element type is a
  1012. character level element, FALSE if not.
  1013. ----------------------------------------------------------------------*/
  1014. ThotBool IsCharacterLevelType (ElementType elType)
  1015. {
  1016. int i;
  1017. ThotBool ret;
  1018. ret = FALSE;
  1019. if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
  1020. return ret;
  1021. i = 0;
  1022. while (CharLevelElement[i] > 0 &&
  1023. CharLevelElement[i] != elType.ElTypeNum)
  1024. i++;
  1025. if (CharLevelElement[i] == elType.ElTypeNum)
  1026. ret = TRUE;
  1027. return ret;
  1028. }
  1029. /*----------------------------------------------------------------------
  1030. IsCharacterLevelElement return TRUE if element el is a
  1031. character level element, FALSE if not.
  1032. ----------------------------------------------------------------------*/
  1033. ThotBool IsCharacterLevelElement (Element el)
  1034. {
  1035. ElementType elType;
  1036. elType = TtaGetElementType (el);
  1037. return IsCharacterLevelType (elType);
  1038. }
  1039. /*----------------------------------------------------------------------
  1040. IsBlockElementType return TRUE if element type is a block element.
  1041. Same as IsBlockElement but just with the element type.
  1042. ----------------------------------------------------------------------*/
  1043. ThotBool IsBlockElementType (ElementType elType)
  1044. {
  1045. int i;
  1046. ThotBool ret;
  1047. ret = FALSE;
  1048. if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
  1049. return ret;
  1050. i = 0;
  1051. while (BlockLevelElement[i] > 0 &&
  1052. BlockLevelElement[i] != elType.ElTypeNum)
  1053. i++;
  1054. if (BlockLevelElement[i] == elType.ElTypeNum)
  1055. ret = TRUE;
  1056. return ret;
  1057. }
  1058. /*----------------------------------------------------------------------
  1059. IsBlockElement return TRUE if element el is a block element.
  1060. ----------------------------------------------------------------------*/
  1061. ThotBool IsBlockElement (Element el)
  1062. {
  1063. ElementType elType;
  1064. elType = TtaGetElementType (el);
  1065. return IsBlockElementType (elType);
  1066. }
  1067. /*----------------------------------------------------------------------
  1068. TextToDocument Put the content of input buffer in the document.
  1069. ----------------------------------------------------------------------*/
  1070. static void TextToDocument ()
  1071. {
  1072. ElementType elType;
  1073. Element elText, parent;
  1074. int i;
  1075. ThotBool ignoreLeadingSpaces;
  1076. ThotBool insSibling, ok;
  1077. CloseBuffer ();
  1078. if (HTMLcontext.lastElement)
  1079. {
  1080. i = 0;
  1081. insSibling = InsertSibling ();
  1082. ignoreLeadingSpaces = IsLeadingSpaceUseless (HTMLcontext.lastElement,
  1083. HTMLcontext.doc, insSibling, FALSE);
  1084. if (ignoreLeadingSpaces &&
  1085. !Within (HTML_EL_Preformatted, DocumentSSchema) &&
  1086. !Within (HTML_EL_STYLE_, DocumentSSchema) &&
  1087. !Within (HTML_EL_SCRIPT_, DocumentSSchema))
  1088. /* suppress leading spaces */
  1089. while (inputBuffer[i] <= SPACE && inputBuffer[i] != EOS)
  1090. i++;
  1091. if (inputBuffer[i] != EOS)
  1092. {
  1093. elType = TtaGetElementType (HTMLcontext.lastElement);
  1094. if (elType.ElTypeNum == HTML_EL_TEXT_UNIT && HTMLcontext.mergeText)
  1095. TtaAppendTextContent (HTMLcontext.lastElement, (unsigned char *)&(inputBuffer[i]),
  1096. HTMLcontext.doc);
  1097. else
  1098. {
  1099. if (inputBuffer[i] == SPACE && LgBuffer == 1)
  1100. {
  1101. // avoid to generate an empty pseudo paragraph
  1102. ok = FALSE;
  1103. if (InsertSibling ())
  1104. parent = TtaGetParent (HTMLcontext.lastElement);
  1105. else
  1106. parent = HTMLcontext.lastElement;
  1107. if (parent)
  1108. {
  1109. elType = TtaGetElementType (parent);
  1110. if (IsCharacterLevelElement (parent) ||
  1111. !XhtmlCannotContainText (elType))
  1112. ok = TRUE; // generate the TEXT element
  1113. }
  1114. }
  1115. else
  1116. ok = TRUE;
  1117. if (ok)
  1118. {
  1119. /* create a TEXT element */
  1120. elType.ElSSchema = DocumentSSchema;
  1121. elType.ElTypeNum = HTML_EL_TEXT_UNIT;
  1122. elText = TtaNewElement (HTMLcontext.doc, elType);
  1123. TtaSetElementLineNumber (elText, BufferLineNumber);
  1124. InsertElement (&elText);
  1125. HTMLcontext.lastElementClosed = TRUE;
  1126. HTMLcontext.mergeText = TRUE;
  1127. /* put the content of the input buffer into the TEXT element */
  1128. if (elText)
  1129. TtaSetTextContent (elText, (unsigned char *)&(inputBuffer[i]),
  1130. HTMLcontext.language, HTMLcontext.doc);
  1131. }
  1132. }
  1133. }
  1134. }
  1135. InitBuffer ();
  1136. }
  1137. /*----------------------------------------------------------------------
  1138. StartOfTag Beginning of a HTML tag (start or end tag).
  1139. Put the preceding text into the Thot document.
  1140. ----------------------------------------------------------------------*/
  1141. static void StartOfTag (char c)
  1142. {
  1143. if (LgBuffer > 0)
  1144. TextToDocument ();
  1145. HTMLcontext.mergeText = FALSE;
  1146. StartOfT

Large files are truncated, but you can click here to view the full file