PageRenderTime 83ms CodeModel.GetById 28ms app.highlight 45ms RepoModel.GetById 1ms app.codeStats 0ms

/XmlNode.cpp

https://github.com/gigablast/open-source-search-engine
C++ | 741 lines | 427 code | 75 blank | 239 comment | 147 complexity | 0b5bf25f53f329228bee680be032c574 MD5 | raw file
  1#include "gb-include.h"
  2
  3#include "XmlNode.h"
  4#include "Mem.h"
  5
  6// . Here's a nice list of all the html nodes names, lengths, whether they're
  7//   a breaking node or not and their node id
  8// . isVisible is true if text in between front and end tags is visible on page
  9// . isVisible is used by Xml::getText() 
 10// . filterKeep is 1 if we should keep it when &strip=1 is given when getting
 11//   the cached document. i added this for faisal
 12// . a filterKeep of 0 means remove tag and text between it and its back tag.
 13// . a filterKeep of 1 means keep the tag and text between it and its back tag.
 14// . a filterKeep of 2 means remove tag BUT keep the text between
 15//   it and its back tag. 
 16NodeType g_nodes[] = {
 17// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
 18//      isXml? (the last field)
 19// --------------------------
 20//  -- text node    ---  0
 21	{"textNode" , 0, 0, 1, 1,1, TAG_TEXTNODE	,0}, 
 22	//  -- xml tag node ---  1
 23	{"xmlTag"   , 1, 1, 1, 2,2, TAG_XMLTAG 	,0}, 
 24	{"A"        , 1, 0, 1, 1,1, TAG_A 	,0},
 25	{"ABBREV"   , 1, 1, 1, 2,2, TAG_ABBREV 	,0},
 26	{"ACRONYM"  , 1, 1, 1, 2,1, TAG_ACRONYM ,0},
 27	{"ADDRESS"  , 1, 1, 1, 2,2, TAG_ADDRESS ,0},
 28	{"APPLET"   , 1, 1, 1, 0,0, TAG_APPLET	,0},
 29	{"AREA"     , 0, 1, 1, 0,0, TAG_AREA	,0}, 	
 30	{"AU"       , 1, 1, 1, 0,0, TAG_AU	,0}, 	
 31	{"AUTHOR"   , 1, 1, 1, 0,0, TAG_AUTHOR	,0},
 32	{"B"        , 1, 0, 1, 1,1, TAG_B	,0},
 33	{"BANNER"   , 1, 1, 1, 0,0, TAG_BANNER	,0}, 	
 34	{"BASE"     , 0, 1, 1, 0,0, TAG_BASE	,0}, 	
 35	{"BASEFONT" , 0, 1, 1, 2,2, TAG_BASEFONT	,0}, 	
 36	{"BGSOUND"  , 0, 1, 1, 0,0, TAG_BGSOUND	,0}, 
 37	{"BIG"      , 1, 0, 1, 2,1, TAG_BIG	,0}, 	
 38	{"BLINK"    , 1, 0, 1, 2,2, TAG_BLINK	,0}, 	
 39	{"BLOCKQUOTE",1, 1, 1, 2,1, TAG_BLOCKQUOTE	,0},
 40	{"BQ"       , 1, 1, 1, 0,0, TAG_BQ	,0}, 	
 41	{"BODY"     , 1, 1, 1, 1,1, TAG_BODY	,0}, 	
 42	{"BR"       , 0, 1, 1, 1,1, TAG_BR	,0},
 43	{"CAPTION"  , 1, 1, 1, 2,1, TAG_CAPTION	,0}, 	
 44	{"CENTER"   , 1, 1, 1, 1,1, TAG_CENTER	,0}, 	
 45	{"CITE"     , 1, 1, 1, 2,1, TAG_CITE	,0}, 	
 46	{"CODE"     , 1, 1, 1, 2,1, TAG_CODE	,0}, 	
 47	{"COL"      , 1, 1, 1, 2,2, TAG_COL	,0}, 
 48	{"COLGROUP" , 1, 1, 1, 0,0, TAG_COLGROUP	,0},
 49	{"CREDIT"   , 1, 1, 1, 0,0, TAG_CREDIT	,0}, 	
 50	{"DEL"      , 1, 1, 1, 2,1, TAG_DEL	,0}, 	
 51	{"DFN"      , 1, 1, 1, 2,1, TAG_DFN	,0}, 	
 52	{"DIR"      , 1, 1, 1, 0,0, TAG_DIR	,0}, 	
 53	// MDW: wtf, these have back tags!
 54	// MDW: ok, i fixed it!
 55	{"DIV"      , 1, 1, 1, 1,1, TAG_DIV	,0}, 	
 56	{"DL"       , 1, 1, 1, 1,1, TAG_DL	,0}, 
 57	// this may not have a back tag!	
 58	{"DT"       , 1, 1, 1, 1,1, TAG_DT	,0}, 
 59	// this may not have a back tag!
 60	{"DD"       , 1, 1, 1, 1,1, TAG_DD	,0}, 
 61	{"EM"       , 1, 0, 1, 2,1, TAG_EM	,0}, // emphasized text
 62	{"EMBED"    , 0, 1, 1, 0,0, TAG_EMBED	,0}, 	
 63	{"FIG"      , 1, 1, 1, 0,0, TAG_FIG	,0}, 	
 64	{"FN"       , 1, 1, 1, 0,0, TAG_FN	,0}, 	
 65	{"FONT"     , 1, 0, 1, 1,1, TAG_FONT	,0},
 66	{"FORM"     , 1, 1, 1, 2,2, TAG_FORM	,0},
 67	// this may not have a back tag!
 68	{"FRAME"    , 1, 1, 1, 0,0, TAG_FRAME	,0}, 
 69	{"FRAMESET" , 1, 1, 1, 0,0, TAG_FRAMESET	,0},
 70	{"H1"       , 1, 1, 1, 1,1, TAG_H1	,0},
 71	{"H2"       , 1, 1, 1, 1,1, TAG_H2	,0},
 72	{"H3"       , 1, 1, 1, 1,1, TAG_H3	,0},
 73	{"H4"       , 1, 1, 1, 1,1, TAG_H4	,0}, 
 74	{"H5"       , 1, 1, 1, 1,1, TAG_H5	,0}, 
 75	{"H6"       , 1, 1, 1, 1,1, TAG_H6	,0}, 
 76	{"HEAD"     , 1, 1, 1, 1,1, TAG_HEAD	,0}, 
 77	{"HR"       , 0, 1, 1, 1,1, TAG_HR	,0},
 78	{"HTML"     , 1, 1, 1, 1,1, TAG_HTML	,0}, 	
 79	{"I"        , 1, 0, 1, 2,1, TAG_I	,0},
 80	{"IFRAME"   , 1, 1, 1, 2,2, TAG_IFRAME	,0},
 81	// filter = 1,but tag is turned to alt 	
 82	{"IMG"      , 0, 1, 1, 1,1, TAG_IMG	,0},
 83	{"INPUT"    , 0, 1, 1, 0,0, TAG_INPUT	,0}, 	
 84	{"INS"      , 1, 1, 1, 2,1, TAG_INS	,0}, 	
 85	{"ISINDEX"  , 0, 1, 1, 0,0, TAG_ISINDEX	,0}, 
 86	{"KBD"      , 1, 1, 1, 2,1, TAG_KBD	,0}, 	
 87	{"LANG"     , 1, 1, 1, 0,0, TAG_LANG	,0}, 	
 88	{"LH"       , 1, 1, 1, 0,0, TAG_LH	,0},
 89	// this may or may not have a back tag 	
 90	{"LI"       , 1, 1, 1, 1,1, TAG_LI	,0},
 91	// this may or may not have a back tag 	
 92	{"LINK"     , 0, 1, 1, 0,0, TAG_LINK	,0}, 	
 93	{"LISTING"  , 1, 1, 1, 0,0, TAG_LISTING	,0},
 94	{"MAP"      , 1, 1, 1, 0,0, TAG_MAP	,0}, 
 95	// don't index marquee text	
 96	{"MARQUEE"  , 1, 1, 0, 2,2, TAG_MARQUEE	,0}, 
 97	{"MATH"     , 1, 1, 1, 0,0, TAG_MATH	,0}, 	
 98	{"MENU"     , 1, 1, 1, 1,1, TAG_MENU	,0}, 	
 99	{"META"     , 0, 1, 1, 1,1, TAG_META	,0}, 	
100	{"MULTICOL" , 0, 1, 1, 0,0, TAG_MULTICOL	,0}, 	
101	{"NOBR"     , 1, 0, 1, 0,0, TAG_NOBR	,0}, 	
102	{"NOFRAMES" , 1, 1, 1, 0,0, TAG_NOFRAMES	,0}, 	
103	{"NOTE"     , 1, 1, 1, 0,0, TAG_NOTE	,0}, 	
104	{"OL"       , 1, 1, 1, 1,1, TAG_OL	,0}, 
105	{"OVERLAY"  , 0, 1, 1, 0,0, TAG_OVERLAY	,0}, 
106	// this may not have a back tag!	
107	{"P"        , 0, 1, 1, 1,1, TAG_P	,0}, 
108	{"PARAM"    , 0, 1, 1, 0,0, TAG_PARAM	,0}, 	
109	{"PERSON"   , 1, 1, 1, 0,0, TAG_PERSON	,0}, 	
110	{"PLAINTEXT", 1, 1, 1, 0,0, TAG_PLAINTEXT	,0},	
111	{"PRE"      , 1, 1, 1, 2,1, TAG_PRE	,0}, 	
112	{"Q"        , 1, 1, 1, 2,1, TAG_Q	,0}, 	
113	{"RANGE"    , 0, 1, 1, 0,0, TAG_RANGE	,0}, 
114	{"SAMP"     , 1, 1, 1, 2,1, TAG_SAMP	,0}, 	
115	{"SCRIPT"   , 1, 1, 0, 0,0, TAG_SCRIPT	,0}, 
116	{"SELECT"   , 1, 1, 0, 0,0, TAG_SELECT	,0}, 
117	{"SMALL"    , 1, 0, 1, 2,1, TAG_SMALL	,0}, 	
118	{"SPACER"   , 0, 1, 1, 2,1, TAG_SPACER	,0}, 	
119	{"SPOT"     , 0, 1, 1, 0,0, TAG_SPOT	,0}, 	
120	{"STRIKE"   , 1, 1, 1, 2,1, TAG_STRIKE	,0}, 	
121	{"STRONG"   , 1, 0, 1, 2,1, TAG_STRONG	,0}, 
122	{"SUB"      , 1, 0, 1, 2,2, TAG_SUB	,0}, 	
123	{"SUP"      , 1, 0, 1, 2,2, TAG_SUP	,0}, 	
124	{"TAB"      , 0, 1, 1, 0,0, TAG_TAB	,0}, 	
125	{"TABLE"    , 1, 1, 1, 1,1, TAG_TABLE	,0}, 
126	{"TBODY"    , 1, 1, 1, 1,1, TAG_TBODY	,0}, 	
127
128	// this may not have a back tag!
129	{"TD"       , 1, 1, 1, 1,1, TAG_TD	,0}, 
130	{"TEXTAREA" , 1, 1, 1, 2,2, TAG_TEXTAREA	,0}, 	
131	{"TEXTFLOW" , 0, 1, 1, 0,0, TAG_TEXTFLOW	,0}, 
132	{"TFOOT"    , 0, 1, 1, 0,0, TAG_TFOOT	,0}, 	
133	// this DOES have a back tag
134	{"TH"       , 1, 1, 1, 0,0, TAG_TH	,0},   
135	{"THEAD"    , 0, 1, 1, 0,0, TAG_THEAD	,0}, 	
136	{"TITLE"    , 1, 1, 1, 1,1, TAG_TITLE	,0},
137	// this may not have a back tag!
138 	{"TR"       , 1, 1, 1, 1,1, TAG_TR	,0},
139	{"TT"       , 1, 1, 1, 2,1, TAG_TT	,0}, 	
140
141	{"U"        , 1, 0, 1, 1,1, TAG_U	,0}, 
142	{"UL"       , 1, 0, 1, 1,1, TAG_UL	,0}, 
143	{"VAR"      , 1, 1, 1, 2,1, TAG_VAR	,0}, 	
144	{"WBR"      , 0, 1, 1, 0,0, TAG_WBR	,0}, 	
145	{"XMP"      , 1, 1, 1, 0,0, TAG_XMP	,0},   
146	{"!--"      , 0, 1, 1, 0,0, TAG_COMMENT	,0}, // comment tag!
147
148
149	{"OPTION"   , 0, 1, 1, 2,2, TAG_OPTION	,0}, 
150	{"STYLE"    , 1, 1, 0, 0,1, TAG_STYLE	,0}, 
151	// doctype tag <!DOCTYPE ...>
152	{"DOCTYPE"  , 0, 1, 1, 0,0, TAG_DOCTYPE	,0}, 
153	// used in office.microsoft.com <?xml ...>
154	{"XML"      , 0, 1, 1, 0,0, TAG_XML	,0}, 
155	// <start index> <stop index>
156	{"START"    , 0, 1, 1, 0,0, TAG_START	,0}, 
157	{"STOP"     , 0, 1, 1, 0,0, TAG_STOP	,0}, 
158	// . i added these tags for faisal, but don't really need them
159	//   since our XML tag condition handles this case
160	// . we can no longer treat as a generic XML tags since faisal wanted
161	//   the strip=2 option
162	{"SPAN"     , 1, 0, 1, 2,1, TAG_SPAN	,0}, // not breaking!
163	{"LEGEND"   , 1, 1, 1, 2,1, TAG_LEGEND	,0},
164	{"S"        , 1, 1, 1, 2,1, TAG_S	,0}, // strike tag
165
166	{"ABBR"     , 1, 0, 1, 2,1, TAG_ABBR	,0},
167	{"![CDATA[" , 0, 1, 1, 0,0, TAG_CDATA	,0}, // <![CDATA[ tag
168	{"NOSCRIPT" , 1, 1, 0, 0,0, TAG_NOSCRIPT,0},
169	{"FIELDSET" , 1, 1, 1, 0,0, TAG_FIELDSET,0},
170	// feedburner uses these in the xml
171	{"FEEDBURNER:ORIGLINK", 0, 1, 1, 0,0, TAG_FBORIGLINK ,1},
172	// ahrefs uses these as links
173	{"RDF:RDF",0, 1, 1, 0,0, TAG_RDF ,1},
174	{"RSS",0, 1, 1, 0,0, TAG_RSS ,1},
175	{"FEED",0, 1, 1, 0,0, TAG_FEED ,1},
176
177	{"ITEM",1, 1, 0, 0,0, TAG_ITEM ,1},
178	{"ENTRY",1, 1, 0, 0,0, TAG_ENTRY ,1},
179	{"CHANNEL",1, 1, 0, 0,0, TAG_CHANNEL ,1},
180	{"ENCLOSURE",1, 1, 0, 0,0, TAG_ENCLOSURE ,0},
181	{"WEBLOG",0, 1, 0, 0,0, TAG_WEBLOG ,1},
182
183	{"GBFRAME", 1, 1, 1, 1,1, TAG_GBFRAME ,0}, 
184 	{"TC"       , 1, 1, 1, 1,1, TAG_TC	,0},// HACK: tbl column section
185	{"GBXMLTITLE", 1, 1, 1, 1,1, TAG_GBXMLTITLE,1},
186
187	// facebook xml
188	{"START_TIME", 1, 1, 1, 1,1, TAG_FBSTARTTIME,1},
189	{"END_TIME", 1, 1, 1, 1,1, TAG_FBENDTIME,1},
190	{"NAME", 1, 1, 1, 1,1, TAG_FBNAME,1},
191	{"PIC_SQUARE", 1, 1, 1, 1,1, TAG_FBPICSQUARE,1},
192	{"HIDE_GUEST_LIST", 1, 1, 1, 1,1, TAG_FBHIDEGUESTLIST,1},
193
194
195	{"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 },
196	{"BUTTON"   , 1, 1, 1, 0,0, TAG_BUTTON	,0}, 	
197	{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1},
198
199	// for sitemap.xml
200	{"LOC"     , 0, 1, 1, 0,0, TAG_LOC,0}
201	//{"BUTTON"   , 1, 1, 1, 2, 122,0},
202	//{"BDO"      , 1, 1, 1, 2, 123,0},
203	//{"LABEL"    , 1, 1, 1, 2, 124,0},
204	//{"LAYER"    , 1, 1, 1, 2, 125}
205};
206// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
207
208
209// . called by Xml class
210// . returns the length of the node
211// . TODO: "node" is now guaranteed to be \0 terminated -- make this faster
212int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) {
213	// save head of node
214	m_node        = node;
215
216	// sanity check
217	static bool s_check = false;
218	if ( ! s_check ) {
219		s_check = true;
220		// how many NodeTypes do we have in g_nodes?
221		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
222		// set the hash table
223		for ( int32_t i = 0 ; i < nn ; i++ ) {
224			// sanity
225			if ( g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;}
226		}
227	}
228
229
230	// . reset this
231	// . need to do here instead of in Links.cpp because sometimes
232	//   we think an anchor tag indicates a link, but it is really
233	//   just an <a href="javascript:..."> function call and Links.cpp
234	//   ignored it but we are expecting this to be valid!
235	m_isSelfLink = 0;
236
237	// reset
238	//m_linkNum = -1;
239
240	// CDATA tag was identified in earlier versions as a text node. Now 
241	// it is identified as a CDATA tag node. But gb.conf and others always
242	// pass their version as 0
243	if ( node[0] == '<' &&
244	     node[1] == '!' &&
245	     node[2] == '[' &&
246	     node[3] == 'C' &&
247	     node[4] == 'D' &&
248	     node[5] == 'A' &&
249	     node[6] == 'T' &&
250	     node[7] == 'A' &&
251	     node[8] == '[' ) 
252		return setCDATANode ( node );
253
254	// if "node" isn't the start of a tag then set it as a Text Node
255	if ( *node != '<' || ! isTagStart ( node ) ) {//, 0, version ) ) {
256		// . set this node as a text node!
257		// . nodeId for text nodes is 0
258		m_nodeId     = 0;
259		m_node       = node;
260		m_hasBackTag = false;
261		m_hash       = 0;
262		int32_t i = 0;
263		//char inCDATA = 0;
264		// inc i as int32_t as it's NOT the beginning of a tag
265		while ( node[i] && 
266			(node[i] != '<' || ! isTagStart ( node+i)))//,versin)))
267			i++;
268		m_nodeLen = i;
269		m_pairTagNum = -1;
270		return m_nodeLen;
271	}
272
273	// . see if it's a comment (node end is "-->" for comments)
274	// . comments are special cases
275	if  ( node[1] == '!' ) {
276		if ( node[2]=='-' && node[3]=='-' ) 
277			return setCommentNode ( node );
278		// this means comment too:
279		// <![if ....]>
280		if ( node[2]=='[' )
281			return setCommentNode2 ( node );
282	}
283
284	// . otherwise it's a regular tag
285	// . might be <!DOCTYPE ...> or something though
286	m_nodeLen = getTagLen ( node );//, version );
287	// . get the node's name's length (i-1)
288	// . node name ends at non alnum char 
289	// . we can have hyphens in node name (TODO: even at beginning???)
290	int32_t tagNameStart = 1;
291	// . skip over backslash in the back tags
292	// . or skip over / or ? or ! now
293	// . tag names must start with a letter, fwiw
294	if ( ! is_alnum_a(node[tagNameStart]) /* == '/'*/ ) tagNameStart++;
295	int32_t i = tagNameStart;
296	// skip i to end of tagName. this should only allow ascii chars
297	// to be "tag name chars"
298	for ( ; i < m_nodeLen && is_tagname_char(node[i]) ; i++ );
299	// set the tagName and tagNameLen
300	m_tagName    = &node [ tagNameStart ];
301	m_tagNameLen = i - tagNameStart;
302
303	// break point
304	//if ( m_tagNameLen == 3 && m_tagName[0]=='!' && 
305	//     m_tagName[1]=='-' && m_tagName[2]=='-' )
306	//	fprintf(stderr,"man!");
307	// . set the node's hash -- used cuz it's faster than strcmp
308	// . just hash the letters as upper case
309	// . tag names are never utf8, so use the ascii ha
310	m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL);
311
312	// if we're pure xml, don't allow any html tags accept <!-- -->
313	if ( pureXml ) {
314		m_hasBackTag = true;
315		m_isBreaking = true;
316		m_isVisible  = true;
317		//m_nodeId     = TAG_XMLTAG;//1;
318		// this returns 1 if tag is not in the list
319		m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , 
320	}
321	// . determine if the nodeId for this node
322	// . determine if it breaks lines (for phrasing purposes)
323	else 
324		m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag , 
325	                                //&m_isBreaking , &m_isVisible );
326
327
328	// . no back tag if / follow name
329	// . this was only for "pureXml" but now i do it for all tags!
330	if ( m_node [ m_nodeLen - 2 ] == '/' ) 	m_hasBackTag = false;
331	if ( m_node [ m_nodeLen - 2 ] == '?' ) 	m_hasBackTag = false;
332
333	return m_nodeLen;
334}
335
336// . return the length of a node starting at "node"
337int32_t getTagLen ( char *node ) { // , int32_t version ) {
338	// see if it's not a node
339	//if ( node[0] != '<' ) return 0;
340	// skip over first <
341	int32_t i ;
342	// . keep looping until we hit a < or > OR while we're in quotes
343	// . ignore < and > when they're in quotes
344	for ( i = 1 ; node[i] ; i++ ) {
345		// this switch should speed things up... no!
346		if ( node[i] != '<'  &&
347		     node[i] != '>'  &&
348		     node[i] != '\"' &&
349		     node[i] != '\''  )
350			continue;
351		// this is about 1.3 times faster than above (with -O2 on both)
352		//if ( ! is_tag_control_char ( node[i] ) ) continue;
353		if ( node[i] == '<' ) break;
354		if ( node[i] == '>' ) {
355			break;
356			//if ( node[i-1]!='b') break;
357			//if ( i -2 < 0      ) break;
358			//if ( node[i-2]!='g') break;
359			// we had a "gb>" which means that these 3 chars
360			// we originally a &gt; html encoded entity which
361			// we decoded for easier parsing
362			//continue;
363		}
364		//if (version >= 70 && version < 77) continue;
365
366		// we can have double quotes within single quotes
367		if ( node [ i ] == '\"' ) {
368			// scan back looking for equal sign...
369			int32_t k; for ( k = i - 1 ; k > 1 ; k-- ) {
370				if ( is_wspace_a(node[k]) ) continue;
371				break;
372			}
373			if ( k <= 1 ) continue;
374			// . if an equal sign did not immediately preceed
375			//   this double quote then ignore the double quote
376			// . this now fixes the harwoodmuseum.org issue
377			//   talked about below
378			if ( node[k] != '=' ) continue;
379			// skip over this first quote
380			i++;
381			while ( node[i] && node[i]!='\"' ) {
382				// crap some pages have unbalanced quotes.
383				// see /test/doc.14541556377486183454.html
384				if ( node[i  ]=='>' && 
385				     node[i-1]=='\"' ) {
386					i--;
387					break;
388				}
389				// like an img tag hits a </a> for
390				// http://www.harwoodmuseum.org/press_deta
391				// il.php?ID=44
392				// BUT this fucks up
393				// onclick="tb_show('<b>Community Calendar</b>'
394				// on the </b> which is legitamately in quotes
395				//if ( node[i  ]=='<' && 
396				//     node[i+1]=='/' ) {
397				//	i--;
398				//	break;
399				//}
400				if ( node[i  ]=='>' && 
401				     node[i-1]==' ' &&
402				     node[i-2]=='\"' ) {
403					i--;
404					break;
405				}
406				// skip this char
407				i++;
408			}
409			// return the length if tag ended abuptly
410			if ( ! node[i] ) return i;
411			// back-to-back quotes? common mistake
412			if ( node[i+1] == '\"' ) i++;
413			continue;
414		}
415		// continue if we don't have a " '" or "='"
416		if ( node [ i ] != '\'' ) continue;
417		if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue;
418		// skip to end of quote
419		while ( node[i] && node[i]!='\'' ) i++;
420	}
421	// skip i over the >
422	if ( node[i] == '>' ) i++; 
423	// . else we found no closure outside of quotes so be more stringent
424	// . look for closure with regard to quotes
425	else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++);
426	// return the LENGTH of the whole node
427	return i ;
428}
429
430int32_t XmlNode::setCommentNode ( char *node ) {
431
432	m_nodeId      = TAG_COMMENT;
433	m_isBreaking  = true;
434	m_isVisible   = true;
435	m_hasBackTag  = false;
436	m_hash        = hash64 ( "!--" , 3 , 0LL );
437	m_node        = node;
438	m_tagName     = node + 1; // !--
439	m_tagNameLen  = 3;
440
441	// . compute node length
442	// . TODO: do we have to deal with quotes????
443	// . TODO: what about nested comments?
444	int32_t i;
445	for ( i = 3 ; node[i] ; i++ ) {
446		if ( node[i]   !='>' ) continue;
447		if ( node[i-1] !='-' ) continue;
448		if ( node[i-2] =='-' ) break;
449	}
450
451	// skip i over the >, if any (could be end of doc)
452	if ( node[i] == '>' ) i++;
453
454	m_nodeLen = i;
455
456	return i;
457}
458
459
460int32_t XmlNode::setCommentNode2 ( char *node ) {
461
462	m_nodeId      = TAG_COMMENT;
463	m_isBreaking  = false;//true;
464	m_isVisible   = false;//true;
465	m_hasBackTag  = false;
466	m_hash        = hash64 ( "![" , 2 , 0LL );
467	m_node        = node;
468	m_tagName     = node + 1;
469	m_tagNameLen  = 2;
470
471	// . compute node length
472	// . TODO: do we have to deal with quotes????
473	// . TODO: what about nested comments?
474	int32_t i;
475	for ( i = 2 ; node[i] ; i++ ) {
476		// look for ending of ]> like for <![if gt IE 6]>
477		if ( node[i]   !='>' ) continue;
478		if ( node[i-1] ==']' ) break;
479		// look for ending of --> like for <![endif]-->
480		if ( node[i-1] == '-' && node[i-2] == '-' ) break;
481	}
482
483	// skip i over the >, if any (could be end of doc)
484	if ( node[i] == '>' ) i++;
485
486	m_nodeLen = i;
487
488	return i;
489}
490
491int32_t XmlNode::setCDATANode ( char *node ) {
492
493	m_nodeId      = TAG_CDATA;
494	m_isBreaking  = true;
495	m_isVisible   = true;
496	m_hasBackTag  = false;
497	m_hash        = hash64 ( "![CDATA[" , 8 , 0LL );
498	m_node        = node;
499	m_tagName     = node + 1; // !--
500	m_tagNameLen  = 8;
501
502	// . compute node length
503	// . TODO: do we have to deal with quotes????
504	// . TODO: what about nested comments?
505	int32_t i;
506	for ( i = 8 ; node[i] ; i++ ) {
507		// seems like just ]] is good enough! don't need "]]>"
508		//if ( node[i]   !='>' ) continue;
509		if ( node[i  ] !=']' ) continue;
510		if ( node[i+1] !=']' ) continue;//{ i++; break; }
511		// but skip it if we got it
512		if ( node[i+2] !='>' ) continue;
513		//if ( node[i+2] == '>' ) { i+=3; break;}
514		i += 3;
515		break;
516		// if does not end in '>', skip the ']' anyway
517		// no! hurts regex ending in [0-9]
518		//i+=2; break;
519	}
520
521	// skip i over the >, if any (could be end of doc)
522	//if ( node[i] == '>' ) i++;
523
524	m_nodeLen = i;
525
526	return i;
527}
528
529// Return the value of the specified "field" within this node.
530// the case of "field" does not matter.
531char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) {
532	// reset this to 0
533	*valueLen = 0;
534	// scan for the field name in our node
535	int32_t flen = gbstrlen(field);
536	char inQuotes = '\0';
537	int32_t i;
538
539	// scan the characters in the node, looking for the field name in ascii
540	for ( i = 1; i + flen < m_nodeLen ; i++ ) {
541		// skip the field if it's quoted
542		if ( inQuotes) {
543			if (m_node[i] == inQuotes ) inQuotes = 0;
544			continue;
545		}
546		// set inQuotes to the quote if we're in quotes
547		if ( (m_node[i]=='\"' || m_node[i]=='\'')){ 
548			inQuotes = m_node[i];
549			continue;
550		} 
551		// a field name must be preceeded by non-alnum
552		if ( is_alnum_a ( m_node[i-1] ) ) continue;
553		// the first character of this field shout match field[0]
554		if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue;
555		// field just be immediately followed by an = or space
556		if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue;
557		// field names must match
558		if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue;
559		// break cuz we got a match for our field name
560		break;
561	}
562
563
564	// return NULL if no matching field
565	if ( i + flen >= m_nodeLen ) return NULL;
566
567	// advance i over the fieldname so it pts to = or space
568	i += flen;
569
570	// advance i over spaces
571	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
572
573	// advance over the equal sign, return NULL if does not exist
574	if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL;
575
576	// advance i over spaces after the equal sign
577	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
578	
579	// now parse out the value of this field (could be in quotes)
580	inQuotes = '\0';
581
582	// set inQuotes to the quote if we're in quotes
583	if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++]; 
584
585	// mark this as the start of the value
586	int start=i;
587
588	// advance i until we hit a space, or we hit a that quote if inQuotes
589	if (inQuotes) {
590		while (i<m_nodeLen && m_node[i] != inQuotes ) 
591			i++;
592	}
593	else {
594		while ( i<m_nodeLen &&
595			!is_wspace_a(m_node[i])&&
596			m_node[i]!='>')
597			i++;
598	}
599
600	// set the length of the value
601	*valueLen = i - start;
602
603	// return a ptr to the value
604	return m_node + start;
605}
606
607#include "HashTableX.h"
608
609nodeid_t getTagId ( char *s , NodeType **retp ) {
610
611	// init table?
612	static bool s_init = false;
613	static HashTableX  s_ht;
614	static char s_buf[10000];
615	if ( ! s_init ) {
616		s_init = true;
617		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
618		// how many NodeTypes do we have in g_nodes?
619		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
620		// set the hash table
621		for ( int32_t i = 0 ; i < nn ; i++ ) {
622			char *name = g_nodes[i].m_nodeName;
623			int32_t  nlen = gbstrlen(name);
624			int64_t h = hash64Upper_a ( name,nlen,0LL );
625			NodeType *nt = &g_nodes[i];
626			if ( ! s_ht.addKey(&h,&nt) ) { 
627				char *xx=NULL;*xx=0; }
628		}
629		// sanity
630		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
631		// sanity test
632		nodeid_t tt = getTagId ( "br" );
633		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
634	}
635
636
637	// find end of tag name. hyphens are ok to be in name.
638	// facebook uses underscores like <start_time>
639	char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
640	// hash it for lookup
641	int64_t h = hash64Upper_a ( s , e - s , 0 );
642	// look it up
643	NodeType **ntp = (NodeType **)s_ht.getValue(&h);
644	// assume none
645	if ( retp ) *retp = NULL;
646	// none?
647	if ( ! ntp ) return 0;
648	// got one
649	if ( retp ) *retp = *ntp;
650	// get id otherwise
651	return (*ntp)->m_nodeId;
652}
653
654// . returns the nodeId
655// . 0 means not a node
656// . 1 means it's an xml node
657// . > 1 is reserved for pre-defined html nodes
658nodeid_t XmlNode::setNodeInfo ( int64_t  nodeHash ){//  , char *hasBackTag ,
659	                        //char      *isBreaking , char *isVisible ) {
660	/*
661	// sanity check
662	static bool s_init = false;
663	if ( ! s_init ) { 
664		s_init = true;
665		// how many NodeTypes do we have in g_nodes?
666		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
667		// set the hash table
668		for ( int32_t i = 0 ; i < nn ; i++ ) {
669			// sanity check
670			if(g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;}
671		}
672	}
673	*/
674
675	// . we have a list of all node types called "g_nodes"
676	// . each node type is a NodeType struct
677	// . hash all these node types into a hash table by their node name
678	// . we have 108 node names so we'll use 512 buckets
679	// . given the hash of your node name you can look it up in this table
680	static bool      s_isHashed = false;
681	static int64_t s_hash [512];
682	static nodeid_t  s_num  [512];
683	// how many NodeTypes do we have in g_nodes?
684	static int32_t      s_numNodeTypes = sizeof(g_nodes) / sizeof(NodeType);
685	// we only need to fill in the hash table once since it's static
686	if ( s_isHashed ) goto ready;
687	// clear the hash table
688	memset ( s_hash , 0 , 8*512 );
689	// set the hash table
690	for ( int32_t i = 0 ; i < s_numNodeTypes ; i++ ) {
691		int64_t h = hash64Upper_a ( g_nodes[i].m_nodeName, 
692					    gbstrlen(g_nodes[i].m_nodeName),0LL);
693		//int32_t b = (uint64_t)h % 512;
694		int32_t b = (uint64_t)h & 511;
695		// debug msg
696	     //fprintf(stderr,"node #%"INT32" has bucket #%"INT32", hash =%"INT64"\n",i,b,h);
697		while ( s_hash[b] ) if ( ++b == 512 ) b = 0;
698		s_hash [ b ] = h;
699		s_num  [ b ] = i;
700	}
701	// set this to true so we don't do the hashing again
702	s_isHashed = true;
703
704 ready:
705	// look up nodeHash in hash table
706	//int32_t b = (uint64_t)nodeHash % 512;
707	int32_t b = (uint64_t)nodeHash & 511;
708	while ( s_hash[b] ) {
709		if (   s_hash[b] == nodeHash ) break;
710		if ( ++b == 512 ) b = 0;
711	}
712	// if it wasn't found it must be an xml node(or unrecognized html node)
713	if ( ! s_hash[b] ) {
714		// default is breaking, has back tag and is indexable
715		m_isBreaking = true;
716		m_hasBackTag = true;
717		m_isVisible  = true;
718		return 1; 
719	}
720	// otherwise extract the isBreaking and the nodeId from the hit bucket
721	int32_t n = s_num[b];
722	m_hasBackTag = g_nodes [ n ].m_hasBackTag;
723	m_isBreaking = g_nodes [ n ].m_isBreaking;
724	m_isVisible  = g_nodes [ n ].m_isVisible;
725	// return the tag/node Id
726	return g_nodes [ n ].m_nodeId;
727}
728
729int32_t getNumXmlNodes ( ) {
730	return (int32_t)sizeof(g_nodes) / sizeof(XmlNode);
731}
732
733#include "Words.h" // BACKBITCOMP
734
735bool isBreakingTagId ( nodeid_t tagId ) {
736	return g_nodes [ tagId & BACKBITCOMP ].m_isBreaking;
737}
738
739bool hasBackTag ( nodeid_t tagId ) {
740	return g_nodes [ tagId & BACKBITCOMP ].m_hasBackTag;
741}