/amaya/html2thot.c
C | 7618 lines | 6067 code | 333 blank | 1218 comment | 1612 complexity | 29dab6b5f25c8433321a1eb755e2bd6a MD5 | raw file
Large files are truncated, but you can click here to view the full file
- /*
- *
- * (c) COPYRIGHT INRIA and W3C, 1996-2009
- * Please first read the full copyright statement in file COPYRIGHT.
- *
- */
- /*
- *
- * html2thot parses an HTML file and builds the corresponding abstract tree
- * for a Thot document of type HTML.
- *
- * Author: V. Quint
- * I. Vatton (W3C/INRIA): XML extension and Unicode
- */
- #define THOT_EXPORT extern
- #include "amaya.h"
- #include "css.h"
- #include "fetchHTMLname.h"
- #include "parser.h"
- #include "zlib.h"
- #include "AHTURLTools_f.h"
- #include "css_f.h"
- #include "EDITstyle_f.h"
- #include "fetchHTMLname_f.h"
- #include "fetchXMLname_f.h"
- #include "html2thot_f.h"
- #include "HTMLactions_f.h"
- #include "HTMLedit_f.h"
- #include "HTMLimage_f.h"
- #include "HTMLtable_f.h"
- #include "HTMLsave_f.h"
- #include "init_f.h"
- #include "styleparser_f.h"
- #include "UIcss_f.h"
- #include "XHTMLbuilder_f.h"
- #include "Xml2thot_f.h"
- #ifdef ANNOTATIONS
- #include "annotlib.h"
- #include "ANNOTtools_f.h"
- #endif /* ANNOTATIONS */
- /* tables defined in XHTMLbuilder.c */
- extern AttrValueMapping XhtmlAttrValueMappingTable[];
- extern XmlEntity XhtmlEntityTable[];
/*----------------------------------------------------------------------
  UnicodeFallbackEntry
  One row of the Unicode fallback table: a Unicode code and the 8-bit
  glyph code to be used in its place.
  EightbitCode is interpreted as follows:
    0 < EightbitCode < 255        Symbol code of the correct glyph
    1000 < EightbitCode < 1256    ISO Latin-1 code + 1000 of an
                                  approaching glyph
    2000 < EightbitCode < 2256    Symbol code + 2000 of an approaching
                                  glyph
  ----------------------------------------------------------------------*/
typedef struct _UnicodeFallbackEntry
{
  int unicodeVal;    /* the Unicode code */
  int EightbitCode;  /* the replacement glyph, encoded as described above */
}
UnicodeFallbackEntry;

/*----------------------------------------------------------------------
  UnicodeFallbackTable
  This table MUST be ordered according to the first field of each entry
  (Unicode code) and MUST end with the {0, 0} entry.
  ----------------------------------------------------------------------*/
UnicodeFallbackEntry UnicodeFallbackTable[] =
{
  {338, 1079},   /* OElig: latin capital ligature OE, U+0152 */
  {339, 1111},   /* oelig: latin small ligature oe, U+0153 */
  {352, 1083},   /* Scaron: latin capital letter S with caron, U+0160 */
  {353, 1115},   /* scaron: latin small letter s with caron, U+0161 */
  {376, 1089},   /* Yuml: latin capital letter Y with diaeresis, U+0178 */
  {402, 166},    /* fnof: latin small f with hook = function = florin, U+0192 */
  {603, 101},    /* epsilon: latin small letter open e, U+025B */
  {710, 2217},   /* circ: modifier letter circumflex accent, U+02C6 */
  {732, 1126},   /* tilde: small tilde, U+02DC */
  {770, 1094},   /* hat: combining circumflex accent, U+0302 */
  {818, 45},     /* UnderBar: combining low line, U+0332 */
  {913, 65},     /* Alpha: greek capital letter alpha, U+0391 */
  {914, 66},     /* Beta: greek capital letter beta, U+0392 */
  {915, 71},     /* Gamma: greek capital letter gamma, U+0393 */
  {916, 68},     /* Delta: greek capital letter delta, U+0394 */
  {917, 69},     /* Epsilon: greek capital letter epsilon, U+0395 */
  {918, 90},     /* Zeta: greek capital letter zeta, U+0396 */
  {919, 72},     /* Eta: greek capital letter eta, U+0397 */
  {920, 81},     /* Theta: greek capital letter theta, U+0398 */
  {921, 73},     /* Iota: greek capital letter iota, U+0399 */
  {922, 75},     /* Kappa: greek capital letter kappa, U+039A */
  {923, 76},     /* Lambda: greek capital letter lambda, U+039B */
  {924, 77},     /* Mu: greek capital letter mu, U+039C */
  {925, 78},     /* Nu: greek capital letter nu, U+039D */
  {926, 88},     /* Xi: greek capital letter xi, U+039E */
  {927, 79},     /* Omicron: greek capital letter omicron, U+039F */
  {928, 80},     /* Pi: greek capital letter pi, U+03A0 */
  {929, 82},     /* Rho: greek capital letter rho, U+03A1 */
  {931, 83},     /* Sigma: greek capital letter sigma, U+03A3 */
  {932, 84},     /* Tau: greek capital letter tau, U+03A4 */
  {933, 85},     /* Upsilon: greek capital letter upsilon, U+03A5 */
  {934, 70},     /* Phi: greek capital letter phi, U+03A6 */
  {935, 67},     /* Chi: greek capital letter chi, U+03A7 */
  {936, 89},     /* Psi: greek capital letter psi, U+03A8 */
  {937, 87},     /* Omega: greek capital letter omega, U+03A9 */
  {945, 97},     /* alpha: greek small letter alpha, U+03B1 */
  {946, 98},     /* beta: greek small letter beta, U+03B2 */
  {947, 103},    /* gamma: greek small letter gamma, U+03B3 */
  {948, 100},    /* delta: greek small letter delta, U+03B4 */
  {949, 101},    /* epsilon: greek small letter epsilon, U+03B5 */
  {950, 122},    /* zeta: greek small letter zeta, U+03B6 */
  {951, 104},    /* eta: greek small letter eta, U+03B7 */
  {952, 113},    /* theta: greek small letter theta, U+03B8 */
  {953, 105},    /* iota: greek small letter iota, U+03B9 */
  {954, 107},    /* kappa: greek small letter kappa, U+03BA */
  {955, 108},    /* lambda: greek small letter lambda, U+03BB */
  {956, 109},    /* mu: greek small letter mu, U+03BC */
  {957, 110},    /* nu: greek small letter nu, U+03BD */
  {958, 120},    /* xi: greek small letter xi, U+03BE */
  {959, 111},    /* omicron: greek small letter omicron, U+03BF */
  {960, 112},    /* pi: greek small letter pi, U+03C0 */
  {961, 114},    /* rho: greek small letter rho, U+03C1 */
  {962, 86},     /* sigmaf: greek small letter final sigma, U+03C2 */
  {963, 115},    /* sigma: greek small letter sigma, U+03C3 */
  {964, 116},    /* tau: greek small letter tau, U+03C4 */
  {965, 117},    /* upsilon: greek small letter upsilon, U+03C5 */
  {966, 106},    /* phi: greek small letter phi, U+03C6 */
  {967, 99},     /* chi: greek small letter chi, U+03C7 */
  {968, 121},    /* psi: greek small letter psi, U+03C8 */
  {969, 119},    /* omega: greek small letter omega, U+03C9 */
  {977, 74},     /* thetasym: greek small letter theta symbol, U+03D1 */
  {978, 161},    /* upsih: greek upsilon with hook symbol, U+03D2 */
  {981, 102},    /* phiv: greek phi symbol, U+03D5 */
  {982, 118},    /* piv: greek pi symbol, U+03D6 */
  {8194, 1130},  /* ensp: en space, U+2002 */
  {8195, 1160},  /* emsp: em space, U+2003 */
  {8201, 1129},  /* thinsp: thin space, U+2009 */
  {8204, 1063},  /* zwnj: zero width non-joiner, U+200C */
  {8205, 1063},  /* zwj: zero width joiner, U+200D */
  {8206, 1063},  /* lrm: left-to-right mark, U+200E */
  {8207, 1063},  /* rlm: right-to-left mark, U+200F */
  {8211, 2045},  /* ndash: en dash, U+2013 */
  {8212, 2190},  /* mdash: em dash, U+2014 */
  {8213, 190},   /* horbar: horizontal bar, U+2015 */
  {8214, 189},   /* Verbar: double vertical line, U+2016 */
  {8216, 1096},  /* lsquo: left single quotation mark, U+2018 */
  {8217, 1039},  /* rsquo: right single quotation mark, U+2019 */
  {8218, 1044},  /* sbquo: single low-9 quotation mark, U+201A */
  {8220, 1096},  /* ldquo: left double quotation mark, U+201C */
  {8221, 1039},  /* rdquo: right double quotation mark, U+201D */
  {8222, 1044},  /* bdquo: double low-9 quotation mark, U+201E */
  {8224, 2042},  /* dagger: dagger, U+2020 */
  {8225, 2042},  /* Dagger: double dagger, U+2021 */
  {8226, 183},   /* bull: bullet = black small circle, U+2022 */
  {8230, 188},   /* hellip: horizontal ellipsis = three dot leader, U+2026 */
  {8234, 1063},  /* lre: left-to-right embed, U+202A */
  {8235, 1063},  /* rle: right-to-left embed, U+202B */
  {8236, 1063},  /* pdf: pop directional format, U+202C */
  {8237, 1063},  /* lro: left-to-right override, U+202D */
  {8238, 1063},  /* rlo: right-to-left override, U+202E */
  {8240, 2037},  /* permil: per mille sign, U+2030 */
  {8242, 162},   /* prime: prime = minutes = feet, U+2032 */
  {8243, 178},   /* Prime: double prime = seconds = inches, U+2033 */
  {8249, 1060},  /* lsaquo: single left-pointing angle quotation mark, U+2039 */
  {8250, 1062},  /* rsaquo: single right-pointing angle quotation mark, U+203A */
  {8254, 1175},  /* oline: overline = spacing overscore, U+203E */
  {8260, 164},   /* frasl: fraction slash, U+2044 */
  {8289, 1129},  /* ApplyFunction: function application, U+2061 (shown as a thin space) */
  {8290, 1129},  /* InvisibleTimes: invisible times, U+2062 (shown as a thin space) */
  {8291, 1129},  /* InvisibleComa: invisible separator, U+2063 (shown as a thin space) */
  {8364, 2206},  /* euro: euro sign, U+20AC */
  {8411, 188},   /* TripleDot: combining three dots above, U+20DB */
  {8465, 193},   /* image: blackletter capital I = imaginary part, U+2111 */
  {8471, 211},   /* copysf: sound recording copyright, U+2117 */
  {8472, 195},   /* weierp: script capital P = power set = Weierstrass p, U+2118 */
  {8476, 194},   /* real: blackletter capital R = real part symbol, U+211C */
  {8482, 212},   /* trade: trade mark sign, U+2122 */
  {8501, 192},   /* alefsym: alef symbol = first transfinite cardinal, U+2135 */
  {8518, 1100},  /* DifferentialD: double-struck italic small d, U+2146 */
  {8519, 1101},  /* ExponentialE: double-struck italic small e, U+2147 */
  {8520, 1105},  /* ImaginaryI: double-struck italic small i, U+2148 */
  {8592, 172},   /* larr: leftwards arrow, U+2190 */
  {8593, 173},   /* uarr: upwards arrow, U+2191 */
  {8594, 174},   /* rarr: rightwards arrow, U+2192 */
  {8595, 175},   /* darr: downwards arrow, U+2193 */
  {8596, 171},   /* harr: left right arrow, U+2194 */
  {8629, 191},   /* crarr: downwards arrow with corner leftwards = carriage return, U+21B5 */
  {8646, 171},   /* lrarr: leftwards arrow over rightwards arrow, U+21C6 */
  {8656, 220},   /* lArr: leftwards double arrow, U+21D0 */
  {8657, 221},   /* uArr: upwards double arrow, U+21D1 */
  {8658, 222},   /* rArr: rightwards double arrow, U+21D2 */
  {8659, 223},   /* dArr: downwards double arrow, U+21D3 */
  {8660, 219},   /* hArr: left right double arrow, U+21D4 */
  {8704, 34},    /* forall: for all, U+2200 */
  {8706, 182},   /* part: partial differential, U+2202 */
  {8707, 36},    /* exist: there exists, U+2203 */
  {8709, 198},   /* empty: empty set = null set = diameter, U+2205 */
  {8711, 209},   /* nabla: nabla = backward difference, U+2207 */
  {8712, 206},   /* isin: element of, U+2208 */
  {8713, 207},   /* notin: not an element of, U+2209 */
  {8715, 39},    /* ni: contains as member, U+220B */
  {8719, 213},   /* prod: n-ary product = product sign, U+220F */
  {8721, 229},   /* sum: n-ary summation, U+2211 */
  {8722, 45},    /* minus: minus sign, U+2212 */
  {8726, 1092},  /* Backslash: set minus, U+2216 */
  {8727, 42},    /* lowast: asterisk operator, U+2217 */
  {8730, 214},   /* radic: square root = radical sign, U+221A */
  {8733, 181},   /* prop: proportional to, U+221D */
  {8734, 165},   /* infin: infinity, U+221E */
  {8736, 208},   /* ang: angle, U+2220 */
  {8739, 1124},  /* VerticalBar: divides, U+2223 */
  {8741, 1124},  /* parallel: parallel to, U+2225 */
  {8743, 217},   /* and: logical and = wedge, U+2227 */
  {8744, 218},   /* or: logical or = vee, U+2228 */
  {8745, 199},   /* cap: intersection = cap, U+2229 */
  {8746, 200},   /* cup: union = cup, U+222A */
  {8747, 242},   /* int: integral, U+222B */
  {8756, 92},    /* there4: therefore, U+2234 */
  {8759, 58},    /* Colon: proportion, U+2237 */
  {8764, 126},   /* sim: tilde operator = varies with = similar to, U+223C */
  {8770, 64},    /* EqualTilde: minus tilde, U+2242 */
  {8773, 64},    /* cong: approximately equal to, U+2245 */
  {8776, 187},   /* asymp: almost equal to = asymptotic to, U+2248 */
  {8800, 185},   /* ne: not equal to, U+2260 */
  {8801, 186},   /* equiv: identical to, U+2261 */
  {8804, 163},   /* le: less-than or equal to, U+2264 */
  {8805, 179},   /* ge: greater-than or equal to, U+2265 */
  {8834, 204},   /* sub: subset of, U+2282 */
  {8835, 201},   /* sup: superset of, U+2283 */
  {8836, 203},   /* nsub: not a subset of, U+2284 */
  {8838, 205},   /* sube: subset of or equal to, U+2286 */
  {8839, 202},   /* supe: superset of or equal to, U+2287 */
  {8842, 203},   /* subne: subset of with not equal to, U+228A */
  {8853, 197},   /* oplus: circled plus = direct sum, U+2295 */
  {8855, 196},   /* otimes: circled times = vector product, U+2297 */
  {8868, 94},    /* DownTee: down tack, U+22A4 */
  {8869, 94},    /* perp: up tack = orthogonal to = perpendicular, U+22A5 */
  {8897, 218},   /* Vee: n-ary logical or, U+22C1 */
  {8898, 199},   /* Intersection: n-ary intersection, U+22C2 */
  {8899, 200},   /* Union: n-ary union, U+22C3 */
  {8900, 168},   /* Diamond: diamond operator, U+22C4 */
  {8901, 215},   /* sdot: dot operator, U+22C5 */
  {8902, 1042},  /* star: star operator, U+22C6 */
  {8912, 204},   /* Subset: double subset, U+22D0 */
  {8914, 199},   /* Cap: double intersection, U+22D2 */
  {8915, 200},   /* Cup: double union, U+22D3 (union glyph, like cup) */
  {8968, 233},   /* lceil: left ceiling = apl upstile, U+2308 */
  {8969, 249},   /* rceil: right ceiling, U+2309 */
  {8970, 235},   /* lfloor: left floor = apl downstile, U+230A */
  {8971, 251},   /* rfloor: right floor, U+230B */
  {9001, 225},   /* lang: left-pointing angle bracket = bra, U+2329 */
  {9002, 241},   /* rang: right-pointing angle bracket = ket, U+232A */
  {9663, 209},   /* dtri: white down-pointing small triangle, U+25BF */
  {9674, 224},   /* loz: lozenge, U+25CA */
  {9824, 170},   /* spades: black spade suit, U+2660 */
  {9827, 167},   /* clubs: black club suit = shamrock, U+2663 */
  {9829, 169},   /* hearts: black heart suit = valentine, U+2665 */
  {9830, 168},   /* diams: black diamond suit, U+2666 */
  {0x27E8, 225}, /* lang: mathematical left angle bracket, U+27E8 */
  {0x27E9, 241}, /* rang: mathematical right angle bracket, U+27E9 */
  {10835, 217},  /* And: double logical and, U+2A53 */
  {10836, 218},  /* Or: double logical or, U+2A54 */
  {10869, 1061}, /* Equal: two consecutive equals signs, U+2A75 */
  {10988, 216},  /* Not: double stroke not sign, U+2AEC */
  {65079, 132},  /* OverBrace: vertical left curly bracket form, U+FE37 */
  {65080, 133},  /* UnderBrace: vertical right curly bracket form, U+FE38 */
  {0, 0}         /* THE END: last entry (required) */
};
- typedef struct _ElemToBeChecked *PtrElemToBeChecked;
- typedef struct _ElemToBeChecked
- {
- Element Elem; /* the element to be checked */
- PtrElemToBeChecked nextElemToBeChecked;
- }
- ElemToBeChecked;
- /* empty elements */
- static int EmptyElement[] =
- {
- HTML_EL_AREA,
- HTML_EL_BASE,
- HTML_EL_BaseFont,
- HTML_EL_BR,
- HTML_EL_COL,
- HTML_EL_FRAME,
- HTML_EL_Horizontal_Rule,
- HTML_EL_IMG,
- HTML_EL_Input,
- HTML_EL_ISINDEX,
- HTML_EL_LINK,
- HTML_EL_META,
- HTML_EL_Parameter,
- HTML_EL_PICTURE_UNIT,
- 0};
- /* character level elements */
- static int CharLevelElement[] =
- {
- HTML_EL_TEXT_UNIT, HTML_EL_PICTURE_UNIT, HTML_EL_SYMBOL_UNIT,
- HTML_EL_Anchor,
- HTML_EL_Teletype_text, HTML_EL_Italic_text, HTML_EL_Bold_text,
- HTML_EL_Underlined_text, HTML_EL_Struck_text, HTML_EL_Big_text,
- HTML_EL_Small_text,
- HTML_EL_Emphasis, HTML_EL_Strong, HTML_EL_Def, HTML_EL_Code, HTML_EL_Sample,
- HTML_EL_Keyboard, HTML_EL_Variable_, HTML_EL_Cite, HTML_EL_ABBR,
- HTML_EL_ACRONYM,
- HTML_EL_Font_, HTML_EL_Quotation, HTML_EL_Subscript, HTML_EL_Superscript,
- HTML_EL_Span, HTML_EL_BDO, HTML_EL_ins, HTML_EL_del,
- HTML_EL_IMG, HTML_EL_Input,
- HTML_EL_Option, HTML_EL_OptGroup, HTML_EL_Option_Menu,
- HTML_EL_Text_Input, HTML_EL_Password_Input, HTML_EL_File_Input,
- HTML_EL_Checkbox_Input, HTML_EL_Radio_Input, HTML_EL_Submit_Input,
- HTML_EL_Reset_Input, HTML_EL_Hidden_Input, HTML_EL_Inserted_Text,
- HTML_EL_Button_Input, HTML_EL_BUTTON_,
- HTML_EL_LABEL,
- HTML_EL_BR, HTML_EL_ruby,
- HTML_EL_Object, HTML_EL_Basic_Elem,
- 0};
- /* block level elements, i.e. elements having a Line rule in the presentation
- schema fo the main view */
- static int BlockLevelElement[] =
- {
- HTML_EL_H1, HTML_EL_H2, HTML_EL_H3, HTML_EL_H4, HTML_EL_H5, HTML_EL_H6,
- HTML_EL_Paragraph, HTML_EL_Pseudo_paragraph, HTML_EL_Text_Area,
- HTML_EL_Term, HTML_EL_Address, HTML_EL_LEGEND, HTML_EL_CAPTION,
- HTML_EL_INS, HTML_EL_DEL, HTML_EL_Division,
- 0};
/* A row of the tag tables below: a string of blank-separated tag names. */
typedef char oneLine[100];

/* Start tags that imply the end of a current element.
   Any tag of a row implies the end of the current element when the type
   of that element appears in the same row.
   The table ends with an empty row. */
static oneLine EquivEndingElem[] =
{
  "dt dd li option",            /* list items and options */
  "h1 h2 h3 h4 h5 h6",          /* headings */
  "address pre listing xmp",
  ""                            /* end of table */
};
- /* acording the HTML DTD, HR should be added to the 2nd line above, as it */
- /* is not allowed within a H1, H2, H3, etc. But we should tolerate that case */
- /* because many documents contain rules in headings... */
- /* start tags that imply the end of current element */
- static oneLine StartTagEndingElem[] =
- {
- "form closes form p p* hr h1 h2 h3 h4 h5 h6 dl ul ol menu dir address pre listing xmp head",
- "head closes p p*",
- "title closes p p*",
- "body closes head style script title p p*",
- "li closes p p* h1 h2 h3 h4 h5 h6 dl address pre listing xmp head",
- "hr closes p p* head",
- "h1 closes p p* head",
- "h2 closes p p* head",
- "h3 closes p p* head",
- "h4 closes p p* head",
- "h5 closes p p* head",
- "h6 closes p p* head",
- "dir closes p p* head",
- "address closes p p* head ul",
- "pre closes p p* head ul",
- "listing closes p p* head",
- "xmp closes p p* head",
- "blockquote closes p p* head",
- "dl closes p p* dt menu dir address pre listing xmp head",
- "dt closes p p* menu dir address pre listing xmp head",
- "dd closes p p* menu dir address pre listing xmp head",
- "ul closes p p* head ol menu dir address pre listing xmp",
- "ol closes p p* head ul",
- "menu closes p p* head ul",
- "p closes p p* head h1 h2 h3 h4 h5 h6",
- "p* closes p p* head",
- "div closes p p* head",
- "noscript closes p p* head",
- "center closes font b i p p* head",
- "a closes a",
- "caption closes p p*",
- "colgroup closes caption colgroup col p p*",
- "col closes caption col p p*",
- "table closes p p* head h1 h2 h3 h4 h5 h6 pre listing xmp a",
- "th closes th td",
- "td closes th td",
- "tr closes th td tr caption col colgroup",
- "thead closes caption col colgroup",
- "tfoot closes th td tr caption col colgroup thead tbody",
- "tbody closes th td tr caption col colgroup thead tfoot tbody",
- "optgroup closes option",
- "fieldset closes legend p p* head h1 h2 h3 h4 h5 h6 pre listing xmp a",
- ""
- };
- typedef int State; /* a state of the automaton */
- extern int HTML_ENTRIES;
- static PtrClosedElement *FirstClosedElem;
- /* ---------------------- static variables ---------------------- */
- /* parser stack */
- #define MaxStack 200 /* maximum stack height */
- static int GINumberStack[MaxStack]; /* entry of pHTMLGIMapping */
- static Element ElementStack[MaxStack]; /* element in the Thot abstract
- tree */
- static int ThotLevel[MaxStack]; /* level of element in the Thot
- tree */
- static Language LanguageStack[MaxStack]; /* element language */
- static int StackLevel = 0; /* first free element on the
- stack */
- /* information about the input file */
- #define INPUT_FILE_BUFFER_SIZE 2000
- #define PREV_READ_CHARS 30
- static char FileBuffer[INPUT_FILE_BUFFER_SIZE+1];
- static char PreviousRead[PREV_READ_CHARS+1];
- static char *WorkBuffer = FileBuffer;
- static int LastCharInWorkBuffer = 0; /* last char. in the buffer */
- static int LastCharInPreviousRead = 0;
- static int CurrentBufChar = 0; /* current character read */
- static int StartOfTagIndx = 0; /* last "<" read */
- static int StartOfRead = 0;
- static char PreviousBufChar = EOS; /* previous character read */
- static char *InputText = NULL;
- static gzFile stream = 0;
- static int NumberOfLinesRead = 0;/* number of lines read in the
- file */
- static int NumberOfCharRead = 0; /* number of characters read in the
- current line */
- static ThotBool EmptyLine = TRUE; /* no printable character encountered
- yet in the current line */
- static ThotBool StartOfFile = TRUE; /* no printable character encountered
- yet in the file */
- static ThotBool AfterTagPRE = FALSE; /* <PRE> has just been read */
- static char* docURL = NULL; /* path or URL of the document */
- static char *docURL2 = NULL; /* save the docURL for some cases of parsing errors */
- /* Static variables used for the call to the XML parser */
- static ThotBool NotToReadFile = FALSE;
- static int PreviousNumberOfLinesRead = 0;
- static int PreviousNumberOfCharRead = 0;
- /* Boolean that indicates the end of a HTML file */
- /* It is a static variable because it is used in parameter */
- /* for the call of the new XML parser (EndOfStartGI) */
- static ThotBool EndOfHtmlFile;
/* input buffer */
#define MaxBufferLength 1000
#define AllmostFullBuffer 700
/* maximum size of error messages */
#define MaxMsgLength 300

static unsigned char inputBuffer[MaxBufferLength];
/* actual length of text in input buffer */
static int           LgBuffer = 0;
/* line number in the source file of the beginning of the text
   contained in the buffer */
static int           BufferLineNumber = 0;
- /* information about the Thot document under construction */
- /* global data used by the HTML parser */
- static ParserData HTMLcontext = {0, ISO_8859_1, 0, NULL, 0,
- FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
- static SSchema DocumentSSchema = NULL; /* the HTML structure schema */
- static Element rootElement = NULL; /* root element of the document */
- static int lastElemEntry = 0; /* index in the pHTMLGIMapping of the
- element being created */
- static Attribute lastAttribute = NULL; /* last attribute created */
- static Attribute lastAttrElement = NULL;/* element with which the last
- attribute has been associated */
- static AttributeMapping *lastAttrEntry = NULL; /* entry in the AttributeMappingTable
- of the attribute being created */
- static ThotBool UnknownAttr = FALSE; /* the last attribute encountered is
- invalid */
- static ThotBool ReadingAnAttrValue = FALSE;
- static ThotBool TruncatedAttrValue = FALSE;
- static char *BufferAttrValue = NULL;
- static int LgBufferAttrValue = 0;
- static Element CommentText = NULL; /* TEXT element of the current
- Comment element */
- static Element ASPText = NULL; /* TEXT element of the current
- ASP element */
- static Element PIText = NULL; /* TEXT element of the current
- ASP element */
- static ThotBool UnknownTag = FALSE; /* the last start tag encountered is
- invalid */
- static ThotBool HTMLrootClosed = FALSE;
- static char *HTMLrootClosingTag = NULL;
- static PtrElemToBeChecked FirstElemToBeChecked = NULL;
- static PtrElemToBeChecked LastElemToBeChecked = NULL;
- /* automaton */
- static State currentState; /* current state of the automaton */
- static State returnState; /* return state from subautomaton */
- static ThotBool NormalTransition;
- static ThotBool CharProcessed;
- /* information about an entity being read */
- static char EntityName[MaxEntityLength];/* name of entity being read */
- static int LgEntityName = 0; /* length of entity name read so
- far */
- static int EntityTableEntry = 0; /* entry of the entity table that
- matches the entity read so far */
- static int CharRank = 0; /* rank of the last matching
- character in that entry */
- /* second char of an UTF-8 string */
- static unsigned char SecondByte[6] = {EOS, EOS, EOS, EOS, EOS, EOS};
- static void ProcessStartGI (const char* GIname);
- static void EndOfAttrValue (char c);
/*----------------------------------------------------------------------
  StrCaseStr
  Case-insensitive search of the string str2 within the string str1.
  Returns a pointer to the first occurrence of str2 in str1, or NULL
  when there is no occurrence, when one of the parameters is NULL, or
  when str2 is the empty string.
  ----------------------------------------------------------------------*/
static const char *StrCaseStr (const char *str1, const char *str2)
{
  const char *ptr;
  size_t      len;
  int         first;

  if (str1 == NULL || str2 == NULL)
    return NULL;
  len = strlen (str2);
  /* lower-case the first character of the pattern too: it is compared
     with lower-cased characters of str1.  The casts keep tolower within
     its defined domain when char is signed. */
  first = tolower ((unsigned char) *str2);
  for (ptr = str1; *ptr; ptr++)
    {
      /* cheap test on the first character before the full comparison */
      if (tolower ((unsigned char) *ptr) == first &&
          !strncasecmp (str2, ptr, len))
        return ptr;
    }
  return NULL;
}
/*----------------------------------------------------------------------
  SkipSep
  Skips the separators (spaces and commas) found at the beginning of
  the string ptr.
  Returns a pointer to the first character that is not a separator
  (possibly the end of the string), or NULL when ptr is NULL.
  ----------------------------------------------------------------------*/
char *SkipSep (char *ptr)
{
  if (ptr == NULL)
    return NULL;
  return ptr + strspn (ptr, " ,");
}
/*----------------------------------------------------------------------
  SkipInt
  Skips the value found at the beginning of the string ptr, i.e. all
  characters up to the next separator (space or comma).  The characters
  skipped are not required to be digits.
  Returns a pointer to that separator or to the end of the string, or
  NULL when ptr is NULL.
  ----------------------------------------------------------------------*/
char *SkipInt (char *ptr)
{
  if (ptr == NULL)
    return NULL;
  return ptr + strcspn (ptr, " ,");
}
- /*----------------------------------------------------------------------
- ParseAreaCoords
- Computes x, y, width and height of the box from the coords attribute value.
- ----------------------------------------------------------------------*/
- void ParseAreaCoords (Element element, Document document)
- {
- ElementType elType;
- AttributeType attrType;
- Attribute attrCoords, attrX, attrY;
- Attribute attrW, attrH, attrShape;
- char *ptr3, *text;
- int x1, y1, x2, y2;
- int length, shape, r;
- /* Is it an AREA element */
- elType = TtaGetElementType (element);
- if (elType.ElTypeNum != HTML_EL_AREA)
- return;
- /* Search the coords attribute */
- attrType.AttrSSchema = elType.ElSSchema;
- attrType.AttrTypeNum = HTML_ATTR_coords;
- attrCoords = TtaGetAttribute (element, attrType);
- if (attrCoords == NULL)
- return;
- /* Search the shape attribute */
- attrType.AttrTypeNum = HTML_ATTR_shape;
- attrShape = TtaGetAttribute (element, attrType);
- if (attrShape == NULL)
- /* no shape attribute. Create one with value rectangle */
- {
- attrShape = TtaNewAttribute (attrType);
- TtaAttachAttribute (element, attrShape, document);
- shape = HTML_ATTR_shape_VAL_rectangle;
- TtaSetAttributeValue (attrShape, shape, element, document);
- }
- else
- shape = TtaGetAttributeValue (attrShape);
- length = TtaGetTextAttributeLength (attrCoords);
- text = (char*)TtaGetMemory (length + 1);
- TtaGiveTextAttributeValue (attrCoords, text, &length);
- if (shape == HTML_ATTR_shape_VAL_rectangle ||
- shape == HTML_ATTR_shape_VAL_circle)
- {
- /* Search the x_coord attribute */
- attrType.AttrTypeNum = HTML_ATTR_x_coord;
- attrX = TtaGetAttribute (element, attrType);
- if (attrX == NULL)
- {
- /* create it */
- attrX = TtaNewAttribute (attrType);
- TtaAttachAttribute (element, attrX, document);
- }
- /* Search the y_coord attribute */
- attrType.AttrTypeNum = HTML_ATTR_y_coord;
- attrY = TtaGetAttribute (element, attrType);
- if (attrY == NULL)
- {
- /* create it */
- attrY = TtaNewAttribute (attrType);
- TtaAttachAttribute (element, attrY, document);
- }
- /* Search the width attribute */
- attrType.AttrTypeNum = HTML_ATTR_IntWidthPxl;
- attrW = TtaGetAttribute (element, attrType);
- if (attrW == NULL)
- {
- /* create it */
- attrW = TtaNewAttribute (attrType);
- TtaAttachAttribute (element, attrW, document);
- }
- /* Search the height attribute */
- attrType.AttrTypeNum = HTML_ATTR_IntHeightPxl;
- attrH = TtaGetAttribute (element, attrType);
- if (attrH == NULL)
- {
- /* create it */
- attrH = TtaNewAttribute (attrType);
- TtaAttachAttribute (element, attrH, document);
- }
- if (shape == HTML_ATTR_shape_VAL_rectangle)
- {
- x1 = x2 = y1 = y2 = 0;
- ptr3 = text;
- if (ptr3)
- sscanf (ptr3, "%d", &x1);
- ptr3 = SkipInt (ptr3);
- ptr3 = SkipSep (ptr3);
- if (ptr3)
- sscanf (ptr3, "%d", &y1);
- ptr3 = SkipInt (ptr3);
- ptr3 = SkipSep (ptr3);
- if (ptr3)
- sscanf (ptr3, "%d", &x2);
- ptr3 = SkipInt (ptr3);
- ptr3 = SkipSep (ptr3);
- sscanf (ptr3, "%d", &y2);
- TtaSetAttributeValue (attrX, x1, element, document);
- TtaSetAttributeValue (attrY, y1, element, document);
- TtaSetAttributeValue (attrW, x2 - x1, element, document);
- TtaSetAttributeValue (attrH, y2 - y1, element, document);
- }
- else
- {
- x1 = y1 = r = 0;
- ptr3 = text;
- if (ptr3)
- sscanf (ptr3, "%d", &x1);
- ptr3 = SkipInt (ptr3);
- ptr3 = SkipSep (ptr3);
- if (ptr3)
- sscanf (ptr3, "%d", &y1);
- ptr3 = SkipInt (ptr3);
- ptr3 = SkipSep (ptr3);
- if (ptr3)
- sscanf (ptr3, "%d", &r);
- TtaSetAttributeValue (attrX, x1 - r, element, document);
- TtaSetAttributeValue (attrY, y1 - r, element, document);
- TtaSetAttributeValue (attrW, 2 * r, element, document);
- TtaSetAttributeValue (attrH, 2 * r, element, document);
- }
- }
- else if (shape == HTML_ATTR_shape_VAL_polygon)
- {
- element = TtaGetFirstChild (element);
- length = TtaGetPolylineLength (element);
- /* remove previous points */
- while (length > 1)
- {
- TtaDeletePointInPolyline (element, length, document);
- length--;
- }
- length = 1;
- ptr3 = text;
- /* add new points */
- while (*ptr3 != EOS)
- {
- x1 = y1 = 0;
- sscanf (ptr3, "%d", &x1);
- ptr3 = SkipInt (ptr3);
- ptr3 = SkipSep (ptr3);
- if (ptr3)
- sscanf (ptr3, "%d", &y1);
- ptr3 = SkipInt (ptr3);
- ptr3 = SkipSep (ptr3);
- TtaAddPointInPolyline (element, length, UnPixel, x1, y1,document,
- FALSE);
- length++;
- }
- }
- TtaFreeMemory (text);
- }
- /*----------------------------------------------------------------------
- SetLanguagInHTMLStack
- Sets the value of the language.
- ----------------------------------------------------------------------*/
- void SetLanguagInHTMLStack (Language lang)
- {
- LanguageStack[StackLevel - 1] = lang;
- }
- /*----------------------------------------------------------------------
- IsHtmlParsingCSS
- Returns the value of ParsingCSS boolean.
- ----------------------------------------------------------------------*/
- ThotBool IsHtmlParsingCSS ()
- {
- return HTMLcontext.parsingCSS;
- }
- /*----------------------------------------------------------------------
- SetHtmlParsingCSS
- Sets the value of ParsingCSS boolean.
- ----------------------------------------------------------------------*/
- void SetHtmlParsingCSS (ThotBool value)
- {
- HTMLcontext.parsingCSS = value;
- }
- /*----------------------------------------------------------------------
- SetHtmlParsingTextArea
- Sets the value of ParsingTextArea boolean.
- ----------------------------------------------------------------------*/
- void SetHtmlParsingTextArea (ThotBool value)
- {
- HTMLcontext.parsingTextArea = value;
- }
- /*----------------------------------------------------------------------
- SetHtmlParsingScript
- Sets the value of ParsingScript boolean.
- ----------------------------------------------------------------------*/
- void SetHtmlParsingScript (ThotBool value)
- {
- HTMLcontext.parsingScript = value;
- }
- /*----------------------------------------------------------------------
- SetHtmlElemLineNumber
- Assigns the current line number
- ----------------------------------------------------------------------*/
- void SetHtmlElemLineNumber (Element el)
- {
- TtaSetElementLineNumber (el, NumberOfLinesRead);
- }
- /*----------------------------------------------------------------------
- IsWithinHtmlTable
- Returns the value of WithinTable integer.
- ----------------------------------------------------------------------*/
- int IsWithinHtmlTable ()
- {
- return HTMLcontext.withinTable;
- }
- /*----------------------------------------------------------------------
- copyCEstring create a copy of the string of elements pointed
- by first and return a pointer on the first
- element of the copy.
- ----------------------------------------------------------------------*/
- static PtrClosedElement copyCEstring (PtrClosedElement first)
- {
- PtrClosedElement ret, cur, next, prev;
-
- ret = NULL;
- cur = first;
- prev = NULL;
- while (cur != NULL)
- {
- next = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
- next->nextClosedElem = NULL;
- next->tagNum = cur->tagNum;
- if (ret == NULL)
- ret = next;
- else
- prev->nextClosedElem = next;
- prev = next;
- cur = cur->nextClosedElem;
- }
- return ret;
- }
- /*----------------------------------------------------------------------
- InitMapping intialise the list of the elements closed by
- each start tag.
- ----------------------------------------------------------------------*/
- void InitMapping (void)
- {
- int line;
- int entry;
- int ptr;
- int i;
- typeName name;
- PtrClosedElement newCE, lastCE, firstCE, curCE;
- SSchema schema;
- /* building the table */
- FirstClosedElem = (PtrClosedElement *)TtaGetMemory (HTML_ENTRIES * sizeof(PtrClosedElement));
- for (entry = 0; entry < HTML_ENTRIES; entry++)
- FirstClosedElem[entry] = NULL;
- /* read table EquivEndingElem */
- line = 0;
- do
- /* read one line of EquivEndingElem */
- {
- ptr = 0;
- lastCE = NULL;
- firstCE = NULL;
- do
- {
- /* read one identifier */
- i = 0;
- while (EquivEndingElem[line][ptr] != SPACE &&
- EquivEndingElem[line][ptr] != EOS)
- name[i++] = EquivEndingElem[line][ptr++];
- name[i] = EOS;
- ptr++;
- if (i > 0)
- /* a identifier has been read */
- {
- schema = DocumentSSchema;
- entry = MapGI ((char *)name, &schema, HTMLcontext.doc);
- #ifdef DEBUG
- if (entry < 0)
- fprintf (stderr, "error in EquivEndingElem: tag %s unknown in line\n%s\n", name, EquivEndingElem[line]);
- else
- #endif
- {
- newCE = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
- newCE->nextClosedElem = NULL;
- newCE->tagNum = entry;
- if (firstCE == NULL)
- firstCE = newCE;
- else
- lastCE->nextClosedElem = newCE;
- lastCE = newCE;
- }
- }
- }
- while (EquivEndingElem[line][ptr] != EOS);
- /* one line has been read */
- curCE = firstCE;
- while (curCE != NULL)
- {
- if (curCE->nextClosedElem == NULL)
- newCE = firstCE;
- else
- newCE = copyCEstring (firstCE);
- if (FirstClosedElem[curCE->tagNum] == NULL)
- FirstClosedElem[curCE->tagNum] = newCE;
- else
- {
- lastCE = FirstClosedElem[curCE->tagNum];
- while (lastCE->nextClosedElem != NULL)
- lastCE = lastCE->nextClosedElem;
- lastCE->nextClosedElem = newCE;
- }
- curCE = curCE->nextClosedElem;
- }
- line++;
- }
- while (strcmp (EquivEndingElem[line], "") != 0);
- /* read table StartTagEndingElem */
- line = 0;
- do
- /* read one line of StartTagEndingElem */
- {
- ptr = 0;
- i = 0;
- /* read the first tag name of the line */
- while (StartTagEndingElem[line][ptr] != SPACE &&
- StartTagEndingElem[line][ptr] != EOS)
- name[i++] = StartTagEndingElem[line][ptr++];
- name[i] = EOS;
- i = 0;
- ptr++;
- schema = DocumentSSchema;
- entry = MapGI ((char *)name, &schema, HTMLcontext.doc);
- #ifdef DEBUG
- if (entry < 0)
- fprintf (stderr, "error in StartTagEndingElem: tag %s unknown in line\n%s\n", name, StartTagEndingElem[line]);
- #endif
- /* read the keyword "closes" */
- while (StartTagEndingElem[line][ptr] != SPACE &&
- StartTagEndingElem[line][ptr] != EOS)
- name[i++] = StartTagEndingElem[line][ptr++];
- name[i] = EOS;
- i = 0;
- ptr++;
- #ifdef DEBUG
- if (strcmp (name, "closes") != 0)
- fprintf (stderr, "error in StartTagEndingElem: \"%s\" instead of \"closes\" in line\n%s\n", name, StartTagEndingElem[line]);
- #endif
- lastCE = FirstClosedElem[entry];
- if (lastCE != NULL)
- while (lastCE->nextClosedElem != NULL)
- lastCE = lastCE->nextClosedElem;
- do
- {
- while (StartTagEndingElem[line][ptr] != SPACE &&
- StartTagEndingElem[line][ptr] != EOS)
- name[i++] = StartTagEndingElem[line][ptr++];
- name[i] = EOS;
- ptr++;
- if (i > 0)
- {
- i = 0;
- newCE = (PtrClosedElement) TtaGetMemory (sizeof (ClosedElement));
- newCE->nextClosedElem = NULL;
- schema = DocumentSSchema;
- newCE->tagNum = MapGI ((char *)name, &schema, HTMLcontext.doc);
- #ifdef DEBUG
- if (newCE->tagNum < 0)
- fprintf (stderr, "error in StartTagEndingElem: tag %s unknown in line\n%s\n", name, StartTagEndingElem[line]);
- #endif
- if (lastCE == NULL)
- FirstClosedElem[entry] = newCE;
- else
- lastCE->nextClosedElem = newCE;
- lastCE = newCE;
- }
- }
- while (StartTagEndingElem[line][ptr] != EOS);
- line++;
- }
- while (strcmp (StartTagEndingElem[line], "") != 0);
- }
- /*----------------------------------------------------------------------
- Within checks if an element of type ThotType is in the stack.
- ----------------------------------------------------------------------*/
- static ThotBool Within (int ThotType, SSchema ThotSSchema)
- {
- ThotBool ret;
- int i;
- ElementType elType;
- ret = FALSE;
- i = StackLevel - 1;
- while (i >= 0 && !ret)
- {
- if (ElementStack[i] != NULL)
- {
- elType = TtaGetElementType (ElementStack[i]);
- if (elType.ElTypeNum == ThotType &&
- elType.ElSSchema == ThotSSchema)
- ret = TRUE;
- }
- i--;
- }
- return ret;
- }
- /*----------------------------------------------------------------------
- HTMLParseError print the error message msg on stderr.
- If lineNumber = 0, print the current line number in the source file,
- otherwise print the line number provided.
- ----------------------------------------------------------------------*/
- void HTMLParseError (Document doc, const char* msg, int lineNumber)
- {
- if (IgnoreErrors)
- return;
- HTMLErrorsFound = TRUE;
- if (!ErrFile)
- if (OpenParsingErrors (doc) == FALSE)
- return;
-
- if (doc == HTMLcontext.doc)
- {
- /* the error message is related to the document being parsed */
- if (docURL != NULL)
- {
- if (!XMLErrorsFound)
- fprintf (ErrFile, "\n*** Errors/warnings in %s\n", docURL);
- TtaFreeMemory (docURL);
- docURL = NULL;
- }
- else
- {
- if (CSSErrorsFound && docURL2)
- {
- fprintf (ErrFile, "\n*** Errors/warnings in %s\n", docURL2);
- TtaFreeMemory (docURL2);
- docURL2 = NULL;
- }
- }
- if (lineNumber <= 0)
- /* print the line number and character number before the message */
- fprintf (ErrFile, "@ line %d, char %d: %s\n", NumberOfLinesRead,
- NumberOfCharRead, msg);
- else
- fprintf (ErrFile, "@ line %d, char 0: %s\n", lineNumber, msg);
- }
- else
- /* print only the error message */
- fprintf (ErrFile, "%s\n", msg);
- }
- /*----------------------------------------------------------------------
- CloseBuffer close the input buffer.
- ----------------------------------------------------------------------*/
- static void CloseBuffer ()
- {
- inputBuffer[LgBuffer] = EOS;
- }
- /*----------------------------------------------------------------------
- InitBuffer initialize the input buffer.
- ----------------------------------------------------------------------*/
- static void InitBuffer ()
- {
- LgBuffer = 0;
- }
- static ThotBool InsertElement (Element * el);
- /*----------------------------------------------------------------------
- InsertSibling return TRUE if the new element must be inserted
- in the Thot document as a sibling of lastElement;
- return FALSE it it must be inserted as a child.
- ----------------------------------------------------------------------*/
- static ThotBool InsertSibling ()
- {
- if (StackLevel == 0)
- return FALSE;
- else if (HTMLcontext.lastElementClosed ||
- TtaIsLeaf (TtaGetElementType (HTMLcontext.lastElement)) ||
- (GINumberStack[StackLevel - 1] >= 0 &&
- pHTMLGIMapping[GINumberStack[StackLevel - 1]].XMLcontents == 'E'))
- return TRUE;
- else
- return FALSE;
- }
- /*----------------------------------------------------------------------
- IsEmptyElement return TRUE if element el is defined as an empty element.
- ----------------------------------------------------------------------*/
- static ThotBool IsEmptyElement (Element el)
- {
- ElementType elType;
- int i;
- ThotBool ret;
- ret = FALSE;
- elType = TtaGetElementType (el);
- if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
- return ret;
- i = 0;
- while (EmptyElement[i] > 0 && EmptyElement[i] != elType.ElTypeNum)
- i++;
- if (EmptyElement[i] == elType.ElTypeNum)
- ret = TRUE;
- return ret;
- }
- /*----------------------------------------------------------------------
- IsCharacterLevelType return TRUE if element type is a
- character level element, FALSE if not.
- ----------------------------------------------------------------------*/
- ThotBool IsCharacterLevelType (ElementType elType)
- {
- int i;
- ThotBool ret;
- ret = FALSE;
- if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
- return ret;
- i = 0;
- while (CharLevelElement[i] > 0 &&
- CharLevelElement[i] != elType.ElTypeNum)
- i++;
- if (CharLevelElement[i] == elType.ElTypeNum)
- ret = TRUE;
- return ret;
- }
- /*----------------------------------------------------------------------
- IsCharacterLevelElement return TRUE if element el is a
- character level element, FALSE if not.
- ----------------------------------------------------------------------*/
- ThotBool IsCharacterLevelElement (Element el)
- {
- ElementType elType;
- elType = TtaGetElementType (el);
- return IsCharacterLevelType (elType);
- }
- /*----------------------------------------------------------------------
- IsBlockElementType return TRUE if element type is a block element.
- Same as IsBlockElement but just with the element type.
- ----------------------------------------------------------------------*/
- ThotBool IsBlockElementType (ElementType elType)
- {
- int i;
- ThotBool ret;
- ret = FALSE;
- if (strcmp (TtaGetSSchemaName (elType.ElSSchema), "HTML") != 0)
- return ret;
- i = 0;
- while (BlockLevelElement[i] > 0 &&
- BlockLevelElement[i] != elType.ElTypeNum)
- i++;
- if (BlockLevelElement[i] == elType.ElTypeNum)
- ret = TRUE;
- return ret;
- }
- /*----------------------------------------------------------------------
- IsBlockElement return TRUE if element el is a block element.
- ----------------------------------------------------------------------*/
- ThotBool IsBlockElement (Element el)
- {
- ElementType elType;
- elType = TtaGetElementType (el);
- return IsBlockElementType (elType);
- }
-
- /*----------------------------------------------------------------------
- TextToDocument Put the content of input buffer in the document.
- ----------------------------------------------------------------------*/
- static void TextToDocument ()
- {
- ElementType elType;
- Element elText, parent;
- int i;
- ThotBool ignoreLeadingSpaces;
- ThotBool insSibling, ok;
- CloseBuffer ();
- if (HTMLcontext.lastElement)
- {
- i = 0;
- insSibling = InsertSibling ();
- ignoreLeadingSpaces = IsLeadingSpaceUseless (HTMLcontext.lastElement,
- HTMLcontext.doc, insSibling, FALSE);
- if (ignoreLeadingSpaces &&
- !Within (HTML_EL_Preformatted, DocumentSSchema) &&
- !Within (HTML_EL_STYLE_, DocumentSSchema) &&
- !Within (HTML_EL_SCRIPT_, DocumentSSchema))
- /* suppress leading spaces */
- while (inputBuffer[i] <= SPACE && inputBuffer[i] != EOS)
- i++;
- if (inputBuffer[i] != EOS)
- {
- elType = TtaGetElementType (HTMLcontext.lastElement);
- if (elType.ElTypeNum == HTML_EL_TEXT_UNIT && HTMLcontext.mergeText)
- TtaAppendTextContent (HTMLcontext.lastElement, (unsigned char *)&(inputBuffer[i]),
- HTMLcontext.doc);
- else
- {
- if (inputBuffer[i] == SPACE && LgBuffer == 1)
- {
- // avoid to generate an empty pseudo paragraph
- ok = FALSE;
- if (InsertSibling ())
- parent = TtaGetParent (HTMLcontext.lastElement);
- else
- parent = HTMLcontext.lastElement;
- if (parent)
- {
- elType = TtaGetElementType (parent);
- if (IsCharacterLevelElement (parent) ||
- !XhtmlCannotContainText (elType))
- ok = TRUE; // generate the TEXT element
- }
- }
- else
- ok = TRUE;
- if (ok)
- {
- /* create a TEXT element */
- elType.ElSSchema = DocumentSSchema;
- elType.ElTypeNum = HTML_EL_TEXT_UNIT;
- elText = TtaNewElement (HTMLcontext.doc, elType);
- TtaSetElementLineNumber (elText, BufferLineNumber);
- InsertElement (&elText);
- HTMLcontext.lastElementClosed = TRUE;
- HTMLcontext.mergeText = TRUE;
- /* put the content of the input buffer into the TEXT element */
- if (elText)
- TtaSetTextContent (elText, (unsigned char *)&(inputBuffer[i]),
- HTMLcontext.language, HTMLcontext.doc);
- }
- }
- }
- }
- InitBuffer ();
- }
- /*----------------------------------------------------------------------
- StartOfTag Beginning of a HTML tag (start or end tag).
- Put the preceding text into the Thot document.
- ----------------------------------------------------------------------*/
- static void StartOfTag (char c)
- {
- if (LgBuffer > 0)
- TextToDocument ();
- HTMLcontext.mergeText = FALSE;
- StartOfT…
Large files files are truncated, but you can click here to view the full file