PageRenderTime 53ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/XmlNode.cpp

https://github.com/gigablast/open-source-search-engine
C++ | 741 lines | 427 code | 75 blank | 239 comment | 147 complexity | 0b5bf25f53f329228bee680be032c574 MD5 | raw file
Possible License(s): Apache-2.0
  1. #include "gb-include.h"
  2. #include "XmlNode.h"
  3. #include "Mem.h"
  4. // . Table of all the known html/xml node types: their names, whether they
  5. // have a back tag, whether they're a breaking node or not and their node id
  6. // . isVisible is true if text in between front and end tags is visible on page
  7. // . isVisible is used by Xml::getText()
  8. // . filterKeep is 1 if we should keep it when &strip=1 is given when getting
  9. // the cached document. i added this for faisal
  10. // . a filterKeep of 0 means remove tag and text between it and its back tag.
  11. // . a filterKeep of 1 means keep the tag and text between it and its back tag.
  12. // . a filterKeep of 2 means remove tag BUT keep the text between
  13. // it and its back tag.
  // . entries MUST stay in node id order: XmlNode::set() verifies on its first
  //   call that g_nodes[i].m_nodeId == i and deliberately crashes otherwise
  // . names are hashed case-insensitively (hash64Upper_a) for lookup by
  //   XmlNode::setNodeInfo() and getTagId()
  14. NodeType g_nodes[] = {
  15. // NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
  16. // isXml? (the last field)
  17. // --------------------------
  18. // -- text node --- 0
  19. {"textNode" , 0, 0, 1, 1,1, TAG_TEXTNODE ,0},
  20. // -- xml tag node --- 1
  21. {"xmlTag" , 1, 1, 1, 2,2, TAG_XMLTAG ,0},
  22. {"A" , 1, 0, 1, 1,1, TAG_A ,0},
  23. {"ABBREV" , 1, 1, 1, 2,2, TAG_ABBREV ,0},
  24. {"ACRONYM" , 1, 1, 1, 2,1, TAG_ACRONYM ,0},
  25. {"ADDRESS" , 1, 1, 1, 2,2, TAG_ADDRESS ,0},
  26. {"APPLET" , 1, 1, 1, 0,0, TAG_APPLET ,0},
  27. {"AREA" , 0, 1, 1, 0,0, TAG_AREA ,0},
  28. {"AU" , 1, 1, 1, 0,0, TAG_AU ,0},
  29. {"AUTHOR" , 1, 1, 1, 0,0, TAG_AUTHOR ,0},
  30. {"B" , 1, 0, 1, 1,1, TAG_B ,0},
  31. {"BANNER" , 1, 1, 1, 0,0, TAG_BANNER ,0},
  32. {"BASE" , 0, 1, 1, 0,0, TAG_BASE ,0},
  33. {"BASEFONT" , 0, 1, 1, 2,2, TAG_BASEFONT ,0},
  34. {"BGSOUND" , 0, 1, 1, 0,0, TAG_BGSOUND ,0},
  35. {"BIG" , 1, 0, 1, 2,1, TAG_BIG ,0},
  36. {"BLINK" , 1, 0, 1, 2,2, TAG_BLINK ,0},
  37. {"BLOCKQUOTE",1, 1, 1, 2,1, TAG_BLOCKQUOTE ,0},
  38. {"BQ" , 1, 1, 1, 0,0, TAG_BQ ,0},
  39. {"BODY" , 1, 1, 1, 1,1, TAG_BODY ,0},
  40. {"BR" , 0, 1, 1, 1,1, TAG_BR ,0},
  41. {"CAPTION" , 1, 1, 1, 2,1, TAG_CAPTION ,0},
  42. {"CENTER" , 1, 1, 1, 1,1, TAG_CENTER ,0},
  43. {"CITE" , 1, 1, 1, 2,1, TAG_CITE ,0},
  44. {"CODE" , 1, 1, 1, 2,1, TAG_CODE ,0},
  45. {"COL" , 1, 1, 1, 2,2, TAG_COL ,0},
  46. {"COLGROUP" , 1, 1, 1, 0,0, TAG_COLGROUP ,0},
  47. {"CREDIT" , 1, 1, 1, 0,0, TAG_CREDIT ,0},
  48. {"DEL" , 1, 1, 1, 2,1, TAG_DEL ,0},
  49. {"DFN" , 1, 1, 1, 2,1, TAG_DFN ,0},
  50. {"DIR" , 1, 1, 1, 0,0, TAG_DIR ,0},
  51. // MDW: note that DIV and DL have back tags!
  52. // MDW: ok, i fixed it!
  53. {"DIV" , 1, 1, 1, 1,1, TAG_DIV ,0},
  54. {"DL" , 1, 1, 1, 1,1, TAG_DL ,0},
  55. // this may not have a back tag!
  56. {"DT" , 1, 1, 1, 1,1, TAG_DT ,0},
  57. // this may not have a back tag!
  58. {"DD" , 1, 1, 1, 1,1, TAG_DD ,0},
  59. {"EM" , 1, 0, 1, 2,1, TAG_EM ,0}, // emphasized text
  60. {"EMBED" , 0, 1, 1, 0,0, TAG_EMBED ,0},
  61. {"FIG" , 1, 1, 1, 0,0, TAG_FIG ,0},
  62. {"FN" , 1, 1, 1, 0,0, TAG_FN ,0},
  63. {"FONT" , 1, 0, 1, 1,1, TAG_FONT ,0},
  64. {"FORM" , 1, 1, 1, 2,2, TAG_FORM ,0},
  65. // this may not have a back tag!
  66. {"FRAME" , 1, 1, 1, 0,0, TAG_FRAME ,0},
  67. {"FRAMESET" , 1, 1, 1, 0,0, TAG_FRAMESET ,0},
  68. {"H1" , 1, 1, 1, 1,1, TAG_H1 ,0},
  69. {"H2" , 1, 1, 1, 1,1, TAG_H2 ,0},
  70. {"H3" , 1, 1, 1, 1,1, TAG_H3 ,0},
  71. {"H4" , 1, 1, 1, 1,1, TAG_H4 ,0},
  72. {"H5" , 1, 1, 1, 1,1, TAG_H5 ,0},
  73. {"H6" , 1, 1, 1, 1,1, TAG_H6 ,0},
  74. {"HEAD" , 1, 1, 1, 1,1, TAG_HEAD ,0},
  75. {"HR" , 0, 1, 1, 1,1, TAG_HR ,0},
  76. {"HTML" , 1, 1, 1, 1,1, TAG_HTML ,0},
  77. {"I" , 1, 0, 1, 2,1, TAG_I ,0},
  78. {"IFRAME" , 1, 1, 1, 2,2, TAG_IFRAME ,0},
  79. // filter = 1,but tag is turned to alt
  80. {"IMG" , 0, 1, 1, 1,1, TAG_IMG ,0},
  81. {"INPUT" , 0, 1, 1, 0,0, TAG_INPUT ,0},
  82. {"INS" , 1, 1, 1, 2,1, TAG_INS ,0},
  83. {"ISINDEX" , 0, 1, 1, 0,0, TAG_ISINDEX ,0},
  84. {"KBD" , 1, 1, 1, 2,1, TAG_KBD ,0},
  85. {"LANG" , 1, 1, 1, 0,0, TAG_LANG ,0},
  86. {"LH" , 1, 1, 1, 0,0, TAG_LH ,0},
  87. // this may or may not have a back tag
  88. {"LI" , 1, 1, 1, 1,1, TAG_LI ,0},
  89. // this may or may not have a back tag
  90. {"LINK" , 0, 1, 1, 0,0, TAG_LINK ,0},
  91. {"LISTING" , 1, 1, 1, 0,0, TAG_LISTING ,0},
  92. {"MAP" , 1, 1, 1, 0,0, TAG_MAP ,0},
  93. // don't index marquee text
  94. {"MARQUEE" , 1, 1, 0, 2,2, TAG_MARQUEE ,0},
  95. {"MATH" , 1, 1, 1, 0,0, TAG_MATH ,0},
  96. {"MENU" , 1, 1, 1, 1,1, TAG_MENU ,0},
  97. {"META" , 0, 1, 1, 1,1, TAG_META ,0},
  98. {"MULTICOL" , 0, 1, 1, 0,0, TAG_MULTICOL ,0},
  99. {"NOBR" , 1, 0, 1, 0,0, TAG_NOBR ,0},
  100. {"NOFRAMES" , 1, 1, 1, 0,0, TAG_NOFRAMES ,0},
  101. {"NOTE" , 1, 1, 1, 0,0, TAG_NOTE ,0},
  102. {"OL" , 1, 1, 1, 1,1, TAG_OL ,0},
  103. {"OVERLAY" , 0, 1, 1, 0,0, TAG_OVERLAY ,0},
  104. // this may not have a back tag! (so hasBackTag is 0 here)
  105. {"P" , 0, 1, 1, 1,1, TAG_P ,0},
  106. {"PARAM" , 0, 1, 1, 0,0, TAG_PARAM ,0},
  107. {"PERSON" , 1, 1, 1, 0,0, TAG_PERSON ,0},
  108. {"PLAINTEXT", 1, 1, 1, 0,0, TAG_PLAINTEXT ,0},
  109. {"PRE" , 1, 1, 1, 2,1, TAG_PRE ,0},
  110. {"Q" , 1, 1, 1, 2,1, TAG_Q ,0},
  111. {"RANGE" , 0, 1, 1, 0,0, TAG_RANGE ,0},
  112. {"SAMP" , 1, 1, 1, 2,1, TAG_SAMP ,0},
  113. {"SCRIPT" , 1, 1, 0, 0,0, TAG_SCRIPT ,0},
  114. {"SELECT" , 1, 1, 0, 0,0, TAG_SELECT ,0},
  115. {"SMALL" , 1, 0, 1, 2,1, TAG_SMALL ,0},
  116. {"SPACER" , 0, 1, 1, 2,1, TAG_SPACER ,0},
  117. {"SPOT" , 0, 1, 1, 0,0, TAG_SPOT ,0},
  118. {"STRIKE" , 1, 1, 1, 2,1, TAG_STRIKE ,0},
  119. {"STRONG" , 1, 0, 1, 2,1, TAG_STRONG ,0},
  120. {"SUB" , 1, 0, 1, 2,2, TAG_SUB ,0},
  121. {"SUP" , 1, 0, 1, 2,2, TAG_SUP ,0},
  122. {"TAB" , 0, 1, 1, 0,0, TAG_TAB ,0},
  123. {"TABLE" , 1, 1, 1, 1,1, TAG_TABLE ,0},
  124. {"TBODY" , 1, 1, 1, 1,1, TAG_TBODY ,0},
  125. // this may not have a back tag!
  126. {"TD" , 1, 1, 1, 1,1, TAG_TD ,0},
  127. {"TEXTAREA" , 1, 1, 1, 2,2, TAG_TEXTAREA ,0},
  128. {"TEXTFLOW" , 0, 1, 1, 0,0, TAG_TEXTFLOW ,0},
  // NOTE(review): TFOOT and THEAD are listed with hasBackTag=0 while TBODY
  // above has hasBackTag=1 -- confirm this asymmetry is intended
  129. {"TFOOT" , 0, 1, 1, 0,0, TAG_TFOOT ,0},
  130. // this DOES have a back tag
  131. {"TH" , 1, 1, 1, 0,0, TAG_TH ,0},
  132. {"THEAD" , 0, 1, 1, 0,0, TAG_THEAD ,0},
  133. {"TITLE" , 1, 1, 1, 1,1, TAG_TITLE ,0},
  134. // this may not have a back tag!
  135. {"TR" , 1, 1, 1, 1,1, TAG_TR ,0},
  136. {"TT" , 1, 1, 1, 2,1, TAG_TT ,0},
  137. {"U" , 1, 0, 1, 1,1, TAG_U ,0},
  // NOTE(review): UL is marked non-breaking (brk?=0) while OL, MENU and DL
  // are marked breaking -- confirm this is intended
  138. {"UL" , 1, 0, 1, 1,1, TAG_UL ,0},
  139. {"VAR" , 1, 1, 1, 2,1, TAG_VAR ,0},
  140. {"WBR" , 0, 1, 1, 0,0, TAG_WBR ,0},
  141. {"XMP" , 1, 1, 1, 0,0, TAG_XMP ,0},
  142. {"!--" , 0, 1, 1, 0,0, TAG_COMMENT ,0}, // comment tag!
  143. {"OPTION" , 0, 1, 1, 2,2, TAG_OPTION ,0},
  144. {"STYLE" , 1, 1, 0, 0,1, TAG_STYLE ,0},
  145. // doctype tag <!DOCTYPE ...>
  146. {"DOCTYPE" , 0, 1, 1, 0,0, TAG_DOCTYPE ,0},
  147. // used in office.microsoft.com <?xml ...>
  148. {"XML" , 0, 1, 1, 0,0, TAG_XML ,0},
  149. // <start index> <stop index>
  150. {"START" , 0, 1, 1, 0,0, TAG_START ,0},
  151. {"STOP" , 0, 1, 1, 0,0, TAG_STOP ,0},
  152. // . i added these tags for faisal, but don't really need them
  153. // since our XML tag condition handles this case
  154. // . we can no longer treat them as generic XML tags since faisal wanted
  155. // the strip=2 option
  156. {"SPAN" , 1, 0, 1, 2,1, TAG_SPAN ,0}, // not breaking!
  157. {"LEGEND" , 1, 1, 1, 2,1, TAG_LEGEND ,0},
  158. {"S" , 1, 1, 1, 2,1, TAG_S ,0}, // strike tag
  159. {"ABBR" , 1, 0, 1, 2,1, TAG_ABBR ,0},
  160. {"![CDATA[" , 0, 1, 1, 0,0, TAG_CDATA ,0}, // <![CDATA[ tag
  161. {"NOSCRIPT" , 1, 1, 0, 0,0, TAG_NOSCRIPT,0},
  162. {"FIELDSET" , 1, 1, 1, 0,0, TAG_FIELDSET,0},
  163. // feedburner uses these in the xml
  164. {"FEEDBURNER:ORIGLINK", 0, 1, 1, 0,0, TAG_FBORIGLINK ,1},
  165. // ahrefs uses these as links
  166. {"RDF:RDF",0, 1, 1, 0,0, TAG_RDF ,1},
  167. {"RSS",0, 1, 1, 0,0, TAG_RSS ,1},
  168. {"FEED",0, 1, 1, 0,0, TAG_FEED ,1},
  169. {"ITEM",1, 1, 0, 0,0, TAG_ITEM ,1},
  170. {"ENTRY",1, 1, 0, 0,0, TAG_ENTRY ,1},
  171. {"CHANNEL",1, 1, 0, 0,0, TAG_CHANNEL ,1},
  // NOTE(review): ENCLOSURE has isXml=0 unlike the feed tags around it -- confirm
  172. {"ENCLOSURE",1, 1, 0, 0,0, TAG_ENCLOSURE ,0},
  173. {"WEBLOG",0, 1, 0, 0,0, TAG_WEBLOG ,1},
  174. {"GBFRAME", 1, 1, 1, 1,1, TAG_GBFRAME ,0},
  175. {"TC" , 1, 1, 1, 1,1, TAG_TC ,0},// HACK: tbl column section
  176. {"GBXMLTITLE", 1, 1, 1, 1,1, TAG_GBXMLTITLE,1},
  177. // facebook xml
  178. {"START_TIME", 1, 1, 1, 1,1, TAG_FBSTARTTIME,1},
  179. {"END_TIME", 1, 1, 1, 1,1, TAG_FBENDTIME,1},
  180. {"NAME", 1, 1, 1, 1,1, TAG_FBNAME,1},
  181. {"PIC_SQUARE", 1, 1, 1, 1,1, TAG_FBPICSQUARE,1},
  182. {"HIDE_GUEST_LIST", 1, 1, 1, 1,1, TAG_FBHIDEGUESTLIST,1},
  183. {"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 },
  184. {"BUTTON" , 1, 1, 1, 0,0, TAG_BUTTON ,0},
  185. {"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1},
  186. // for sitemap.xml
  // NOTE(review): LOC is described as a sitemap.xml tag but has isXml=0 -- confirm
  187. {"LOC" , 0, 1, 1, 0,0, TAG_LOC,0}
  // disabled entries kept for reference (never compiled in):
  188. //{"BUTTON" , 1, 1, 1, 2, 122,0},
  189. //{"BDO" , 1, 1, 1, 2, 123,0},
  190. //{"LABEL" , 1, 1, 1, 2, 124,0},
  191. //{"LAYER" , 1, 1, 1, 2, 125}
  192. };
  193. // NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]
  194. // . called by Xml class
  195. // . returns the length of the node
  196. // . TODO: "node" is now guaranteed to be \0 terminated -- make this faster
  197. int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) {
  198. // save head of node
  199. m_node = node;
  200. // sanity check
  201. static bool s_check = false;
  202. if ( ! s_check ) {
  203. s_check = true;
  204. // how many NodeTypes do we have in g_nodes?
  205. static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
  206. // set the hash table
  207. for ( int32_t i = 0 ; i < nn ; i++ ) {
  208. // sanity
  209. if ( g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;}
  210. }
  211. }
  212. // . reset this
  213. // . need to do here instead of in Links.cpp because sometimes
  214. // we think an anchor tag indicates a link, but it is really
  215. // just an <a href="javascript:..."> function call and Links.cpp
  216. // ignored it but we are expecting this to be valid!
  217. m_isSelfLink = 0;
  218. // reset
  219. //m_linkNum = -1;
  220. // CDATA tag was identified in earlier versions as a text node. Now
  221. // it is identified as a CDATA tag node. But gb.conf and others always
  222. // pass their version as 0
  223. if ( node[0] == '<' &&
  224. node[1] == '!' &&
  225. node[2] == '[' &&
  226. node[3] == 'C' &&
  227. node[4] == 'D' &&
  228. node[5] == 'A' &&
  229. node[6] == 'T' &&
  230. node[7] == 'A' &&
  231. node[8] == '[' )
  232. return setCDATANode ( node );
  233. // if "node" isn't the start of a tag then set it as a Text Node
  234. if ( *node != '<' || ! isTagStart ( node ) ) {//, 0, version ) ) {
  235. // . set this node as a text node!
  236. // . nodeId for text nodes is 0
  237. m_nodeId = 0;
  238. m_node = node;
  239. m_hasBackTag = false;
  240. m_hash = 0;
  241. int32_t i = 0;
  242. //char inCDATA = 0;
  243. // inc i as int32_t as it's NOT the beginning of a tag
  244. while ( node[i] &&
  245. (node[i] != '<' || ! isTagStart ( node+i)))//,versin)))
  246. i++;
  247. m_nodeLen = i;
  248. m_pairTagNum = -1;
  249. return m_nodeLen;
  250. }
  251. // . see if it's a comment (node end is "-->" for comments)
  252. // . comments are special cases
  253. if ( node[1] == '!' ) {
  254. if ( node[2]=='-' && node[3]=='-' )
  255. return setCommentNode ( node );
  256. // this means comment too:
  257. // <![if ....]>
  258. if ( node[2]=='[' )
  259. return setCommentNode2 ( node );
  260. }
  261. // . otherwise it's a regular tag
  262. // . might be <!DOCTYPE ...> or something though
  263. m_nodeLen = getTagLen ( node );//, version );
  264. // . get the node's name's length (i-1)
  265. // . node name ends at non alnum char
  266. // . we can have hyphens in node name (TODO: even at beginning???)
  267. int32_t tagNameStart = 1;
  268. // . skip over backslash in the back tags
  269. // . or skip over / or ? or ! now
  270. // . tag names must start with a letter, fwiw
  271. if ( ! is_alnum_a(node[tagNameStart]) /* == '/'*/ ) tagNameStart++;
  272. int32_t i = tagNameStart;
  273. // skip i to end of tagName. this should only allow ascii chars
  274. // to be "tag name chars"
  275. for ( ; i < m_nodeLen && is_tagname_char(node[i]) ; i++ );
  276. // set the tagName and tagNameLen
  277. m_tagName = &node [ tagNameStart ];
  278. m_tagNameLen = i - tagNameStart;
  279. // break point
  280. //if ( m_tagNameLen == 3 && m_tagName[0]=='!' &&
  281. // m_tagName[1]=='-' && m_tagName[2]=='-' )
  282. // fprintf(stderr,"man!");
  283. // . set the node's hash -- used cuz it's faster than strcmp
  284. // . just hash the letters as upper case
  285. // . tag names are never utf8, so use the ascii ha
  286. m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL);
  287. // if we're pure xml, don't allow any html tags accept <!-- -->
  288. if ( pureXml ) {
  289. m_hasBackTag = true;
  290. m_isBreaking = true;
  291. m_isVisible = true;
  292. //m_nodeId = TAG_XMLTAG;//1;
  293. // this returns 1 if tag is not in the list
  294. m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag ,
  295. }
  296. // . determine if the nodeId for this node
  297. // . determine if it breaks lines (for phrasing purposes)
  298. else
  299. m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag ,
  300. //&m_isBreaking , &m_isVisible );
  301. // . no back tag if / follow name
  302. // . this was only for "pureXml" but now i do it for all tags!
  303. if ( m_node [ m_nodeLen - 2 ] == '/' ) m_hasBackTag = false;
  304. if ( m_node [ m_nodeLen - 2 ] == '?' ) m_hasBackTag = false;
  305. return m_nodeLen;
  306. }
  307. // . return the length of a node (tag) starting at "node"
  // . "node" must be \0 terminated and node[0] is assumed to be the '<'
  // . the scan starts at node[1] and stops at the first '<' or '>' that is not
  //   inside a double-quoted attribute value
  // . a double quote only opens a quoted value if the previous non-space char
  //   is an equal sign
  // . if the scan stopped on a '>' the returned length includes it
  // . if the tag runs to the end of the buffer inside a double-quoted value
  //   the length up to the \0 is returned
  308. int32_t getTagLen ( char *node ) { // , int32_t version ) {
  309. // see if it's not a node
  310. //if ( node[0] != '<' ) return 0;
  311. // "i" is our cursor; it starts at 1 to skip over the first <
  312. int32_t i ;
  313. // . keep looping until we hit a < or > OR while we're in quotes
  314. // . ignore < and > when they're in quotes
  315. for ( i = 1 ; node[i] ; i++ ) {
  316. // quickly skip every char that is not a <, >, " or '
  317. if ( node[i] != '<' &&
  318. node[i] != '>' &&
  319. node[i] != '\"' &&
  320. node[i] != '\'' )
  321. continue;
  322. // this is about 1.3 times faster than above (with -O2 on both)
  323. //if ( ! is_tag_control_char ( node[i] ) ) continue;
  // an unquoted < or > ends the tag
  324. if ( node[i] == '<' ) break;
  325. if ( node[i] == '>' ) {
  326. break;
  327. //if ( node[i-1]!='b') break;
  328. //if ( i -2 < 0 ) break;
  329. //if ( node[i-2]!='g') break;
  330. // we had a "gb>" which means that these 3 chars
  331. // were originally a &gt; html encoded entity which
  332. // we decoded for easier parsing
  333. //continue;
  334. }
  335. //if (version >= 70 && version < 77) continue;
  336. // we can have double quotes within single quotes
  337. if ( node [ i ] == '\"' ) {
  338. // scan back over whitespace looking for equal sign...
  339. int32_t k; for ( k = i - 1 ; k > 1 ; k-- ) {
  340. if ( is_wspace_a(node[k]) ) continue;
  341. break;
  342. }
  // ran back to the start of the tag: not an attribute value
  343. if ( k <= 1 ) continue;
  344. // . if an equal sign did not immediately precede
  345. // this double quote then ignore the double quote
  346. // . this now fixes the harwoodmuseum.org issue
  347. // talked about below
  348. if ( node[k] != '=' ) continue;
  349. // skip over this first quote
  350. i++;
  // advance to the closing double quote (or the end of the buffer)
  351. while ( node[i] && node[i]!='\"' ) {
  352. // some pages have unbalanced quotes: treat "> as the
  353. // end of the tag. see /test/doc.14541556377486183454.html
  354. if ( node[i ]=='>' &&
  355. node[i-1]=='\"' ) {
  356. i--;
  357. break;
  358. }
  359. // like an img tag hits a </a> for
  360. // http://www.harwoodmuseum.org/press_deta
  361. // il.php?ID=44
  362. // BUT stopping on "</" breaks
  363. // onclick="tb_show('<b>Community Calendar</b>'
  364. // on the </b> which is legitimately in quotes
  365. //if ( node[i ]=='<' &&
  366. // node[i+1]=='/' ) {
  367. // i--;
  368. // break;
  369. //}
  // same unbalanced-quote recovery for a quote, a space, then >
  370. if ( node[i ]=='>' &&
  371. node[i-1]==' ' &&
  372. node[i-2]=='\"' ) {
  373. i--;
  374. break;
  375. }
  376. // skip this char
  377. i++;
  378. }
  379. // return the length if tag ended abruptly
  380. if ( ! node[i] ) return i;
  381. // back-to-back quotes? common mistake
  382. if ( node[i+1] == '\"' ) i++;
  383. continue;
  384. }
  385. // continue if we don't have a " '" or "='"
  386. if ( node [ i ] != '\'' ) continue;
  387. if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue;
  388. // skip to end of quote
  // NOTE(review): node[i] is still the opening single quote here, so this
  // loop exits immediately and the single-quoted span is NOT skipped; a '<'
  // or '>' inside single quotes therefore ends the tag. changing this would
  // change how existing documents parse, so confirm before touching it.
  389. while ( node[i] && node[i]!='\'' ) i++;
  390. }
  391. // skip i over the > so it is counted in the length
  392. if ( node[i] == '>' ) i++;
  393. // . else we stopped on a '<' or the end of the buffer, so rescan from
  394. // the start for the first '<' or '>' WITHOUT regard to quotes
  // NOTE(review): on this fallback path the returned length does not include
  // the terminating '>' (i is left pointing at it) -- confirm callers expect that
  395. else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++);
  396. // return the LENGTH of the whole node
  397. return i ;
  398. }
  399. int32_t XmlNode::setCommentNode ( char *node ) {
  400. m_nodeId = TAG_COMMENT;
  401. m_isBreaking = true;
  402. m_isVisible = true;
  403. m_hasBackTag = false;
  404. m_hash = hash64 ( "!--" , 3 , 0LL );
  405. m_node = node;
  406. m_tagName = node + 1; // !--
  407. m_tagNameLen = 3;
  408. // . compute node length
  409. // . TODO: do we have to deal with quotes????
  410. // . TODO: what about nested comments?
  411. int32_t i;
  412. for ( i = 3 ; node[i] ; i++ ) {
  413. if ( node[i] !='>' ) continue;
  414. if ( node[i-1] !='-' ) continue;
  415. if ( node[i-2] =='-' ) break;
  416. }
  417. // skip i over the >, if any (could be end of doc)
  418. if ( node[i] == '>' ) i++;
  419. m_nodeLen = i;
  420. return i;
  421. }
  422. int32_t XmlNode::setCommentNode2 ( char *node ) {
  423. m_nodeId = TAG_COMMENT;
  424. m_isBreaking = false;//true;
  425. m_isVisible = false;//true;
  426. m_hasBackTag = false;
  427. m_hash = hash64 ( "![" , 2 , 0LL );
  428. m_node = node;
  429. m_tagName = node + 1;
  430. m_tagNameLen = 2;
  431. // . compute node length
  432. // . TODO: do we have to deal with quotes????
  433. // . TODO: what about nested comments?
  434. int32_t i;
  435. for ( i = 2 ; node[i] ; i++ ) {
  436. // look for ending of ]> like for <![if gt IE 6]>
  437. if ( node[i] !='>' ) continue;
  438. if ( node[i-1] ==']' ) break;
  439. // look for ending of --> like for <![endif]-->
  440. if ( node[i-1] == '-' && node[i-2] == '-' ) break;
  441. }
  442. // skip i over the >, if any (could be end of doc)
  443. if ( node[i] == '>' ) i++;
  444. m_nodeLen = i;
  445. return i;
  446. }
  447. int32_t XmlNode::setCDATANode ( char *node ) {
  448. m_nodeId = TAG_CDATA;
  449. m_isBreaking = true;
  450. m_isVisible = true;
  451. m_hasBackTag = false;
  452. m_hash = hash64 ( "![CDATA[" , 8 , 0LL );
  453. m_node = node;
  454. m_tagName = node + 1; // !--
  455. m_tagNameLen = 8;
  456. // . compute node length
  457. // . TODO: do we have to deal with quotes????
  458. // . TODO: what about nested comments?
  459. int32_t i;
  460. for ( i = 8 ; node[i] ; i++ ) {
  461. // seems like just ]] is good enough! don't need "]]>"
  462. //if ( node[i] !='>' ) continue;
  463. if ( node[i ] !=']' ) continue;
  464. if ( node[i+1] !=']' ) continue;//{ i++; break; }
  465. // but skip it if we got it
  466. if ( node[i+2] !='>' ) continue;
  467. //if ( node[i+2] == '>' ) { i+=3; break;}
  468. i += 3;
  469. break;
  470. // if does not end in '>', skip the ']' anyway
  471. // no! hurts regex ending in [0-9]
  472. //i+=2; break;
  473. }
  474. // skip i over the >, if any (could be end of doc)
  475. //if ( node[i] == '>' ) i++;
  476. m_nodeLen = i;
  477. return i;
  478. }
  479. // Return the value of the specified "field" within this node.
  480. // the case of "field" does not matter.
  481. char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) {
  482. // reset this to 0
  483. *valueLen = 0;
  484. // scan for the field name in our node
  485. int32_t flen = gbstrlen(field);
  486. char inQuotes = '\0';
  487. int32_t i;
  488. // scan the characters in the node, looking for the field name in ascii
  489. for ( i = 1; i + flen < m_nodeLen ; i++ ) {
  490. // skip the field if it's quoted
  491. if ( inQuotes) {
  492. if (m_node[i] == inQuotes ) inQuotes = 0;
  493. continue;
  494. }
  495. // set inQuotes to the quote if we're in quotes
  496. if ( (m_node[i]=='\"' || m_node[i]=='\'')){
  497. inQuotes = m_node[i];
  498. continue;
  499. }
  500. // a field name must be preceeded by non-alnum
  501. if ( is_alnum_a ( m_node[i-1] ) ) continue;
  502. // the first character of this field shout match field[0]
  503. if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue;
  504. // field just be immediately followed by an = or space
  505. if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue;
  506. // field names must match
  507. if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue;
  508. // break cuz we got a match for our field name
  509. break;
  510. }
  511. // return NULL if no matching field
  512. if ( i + flen >= m_nodeLen ) return NULL;
  513. // advance i over the fieldname so it pts to = or space
  514. i += flen;
  515. // advance i over spaces
  516. while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
  517. // advance over the equal sign, return NULL if does not exist
  518. if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL;
  519. // advance i over spaces after the equal sign
  520. while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
  521. // now parse out the value of this field (could be in quotes)
  522. inQuotes = '\0';
  523. // set inQuotes to the quote if we're in quotes
  524. if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++];
  525. // mark this as the start of the value
  526. int start=i;
  527. // advance i until we hit a space, or we hit a that quote if inQuotes
  528. if (inQuotes) {
  529. while (i<m_nodeLen && m_node[i] != inQuotes )
  530. i++;
  531. }
  532. else {
  533. while ( i<m_nodeLen &&
  534. !is_wspace_a(m_node[i])&&
  535. m_node[i]!='>')
  536. i++;
  537. }
  538. // set the length of the value
  539. *valueLen = i - start;
  540. // return a ptr to the value
  541. return m_node + start;
  542. }
  543. #include "HashTableX.h"
  544. nodeid_t getTagId ( char *s , NodeType **retp ) {
  545. // init table?
  546. static bool s_init = false;
  547. static HashTableX s_ht;
  548. static char s_buf[10000];
  549. if ( ! s_init ) {
  550. s_init = true;
  551. s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
  552. // how many NodeTypes do we have in g_nodes?
  553. static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
  554. // set the hash table
  555. for ( int32_t i = 0 ; i < nn ; i++ ) {
  556. char *name = g_nodes[i].m_nodeName;
  557. int32_t nlen = gbstrlen(name);
  558. int64_t h = hash64Upper_a ( name,nlen,0LL );
  559. NodeType *nt = &g_nodes[i];
  560. if ( ! s_ht.addKey(&h,&nt) ) {
  561. char *xx=NULL;*xx=0; }
  562. }
  563. // sanity
  564. if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
  565. // sanity test
  566. nodeid_t tt = getTagId ( "br" );
  567. if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
  568. }
  569. // find end of tag name. hyphens are ok to be in name.
  570. // facebook uses underscores like <start_time>
  571. char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
  572. // hash it for lookup
  573. int64_t h = hash64Upper_a ( s , e - s , 0 );
  574. // look it up
  575. NodeType **ntp = (NodeType **)s_ht.getValue(&h);
  576. // assume none
  577. if ( retp ) *retp = NULL;
  578. // none?
  579. if ( ! ntp ) return 0;
  580. // got one
  581. if ( retp ) *retp = *ntp;
  582. // get id otherwise
  583. return (*ntp)->m_nodeId;
  584. }
  585. // . returns the nodeId
  586. // . 0 means not a node
  587. // . 1 means it's an xml node
  588. // . > 1 is reserved for pre-defined html nodes
  589. nodeid_t XmlNode::setNodeInfo ( int64_t nodeHash ){// , char *hasBackTag ,
  590. //char *isBreaking , char *isVisible ) {
  591. /*
  592. // sanity check
  593. static bool s_init = false;
  594. if ( ! s_init ) {
  595. s_init = true;
  596. // how many NodeTypes do we have in g_nodes?
  597. static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
  598. // set the hash table
  599. for ( int32_t i = 0 ; i < nn ; i++ ) {
  600. // sanity check
  601. if(g_nodes[i].m_nodeId != i ) { char *xx=NULL;*xx=0;}
  602. }
  603. }
  604. */
  605. // . we have a list of all node types called "g_nodes"
  606. // . each node type is a NodeType struct
  607. // . hash all these node types into a hash table by their node name
  608. // . we have 108 node names so we'll use 512 buckets
  609. // . given the hash of your node name you can look it up in this table
  610. static bool s_isHashed = false;
  611. static int64_t s_hash [512];
  612. static nodeid_t s_num [512];
  613. // how many NodeTypes do we have in g_nodes?
  614. static int32_t s_numNodeTypes = sizeof(g_nodes) / sizeof(NodeType);
  615. // we only need to fill in the hash table once since it's static
  616. if ( s_isHashed ) goto ready;
  617. // clear the hash table
  618. memset ( s_hash , 0 , 8*512 );
  619. // set the hash table
  620. for ( int32_t i = 0 ; i < s_numNodeTypes ; i++ ) {
  621. int64_t h = hash64Upper_a ( g_nodes[i].m_nodeName,
  622. gbstrlen(g_nodes[i].m_nodeName),0LL);
  623. //int32_t b = (uint64_t)h % 512;
  624. int32_t b = (uint64_t)h & 511;
  625. // debug msg
  626. //fprintf(stderr,"node #%"INT32" has bucket #%"INT32", hash =%"INT64"\n",i,b,h);
  627. while ( s_hash[b] ) if ( ++b == 512 ) b = 0;
  628. s_hash [ b ] = h;
  629. s_num [ b ] = i;
  630. }
  631. // set this to true so we don't do the hashing again
  632. s_isHashed = true;
  633. ready:
  634. // look up nodeHash in hash table
  635. //int32_t b = (uint64_t)nodeHash % 512;
  636. int32_t b = (uint64_t)nodeHash & 511;
  637. while ( s_hash[b] ) {
  638. if ( s_hash[b] == nodeHash ) break;
  639. if ( ++b == 512 ) b = 0;
  640. }
  641. // if it wasn't found it must be an xml node(or unrecognized html node)
  642. if ( ! s_hash[b] ) {
  643. // default is breaking, has back tag and is indexable
  644. m_isBreaking = true;
  645. m_hasBackTag = true;
  646. m_isVisible = true;
  647. return 1;
  648. }
  649. // otherwise extract the isBreaking and the nodeId from the hit bucket
  650. int32_t n = s_num[b];
  651. m_hasBackTag = g_nodes [ n ].m_hasBackTag;
  652. m_isBreaking = g_nodes [ n ].m_isBreaking;
  653. m_isVisible = g_nodes [ n ].m_isVisible;
  654. // return the tag/node Id
  655. return g_nodes [ n ].m_nodeId;
  656. }
  657. int32_t getNumXmlNodes ( ) {
  658. return (int32_t)sizeof(g_nodes) / sizeof(XmlNode);
  659. }
  660. #include "Words.h" // BACKBITCOMP
  661. bool isBreakingTagId ( nodeid_t tagId ) {
  662. return g_nodes [ tagId & BACKBITCOMP ].m_isBreaking;
  663. }
  664. bool hasBackTag ( nodeid_t tagId ) {
  665. return g_nodes [ tagId & BACKBITCOMP ].m_hasBackTag;
  666. }