PageRenderTime 26ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/src/document/dom/source.c

https://github.com/nabetaro/elinks
C | 454 lines | 342 code | 90 blank | 22 comment | 72 complexity | cb0c7f36a8211a1792ce41ff1ba86a7f MD5 | raw file
  1. /* DOM-based SGML (HTML) source view renderer (just syntax highlighting :-) */
  2. #ifdef HAVE_CONFIG_H
  3. #include "config.h"
  4. #endif
  5. #include <sys/types.h> /* FreeBSD needs this before regex.h */
  6. #ifdef HAVE_REGEX_H
  7. #include <regex.h>
  8. #endif
  9. #include "elinks.h"
  10. #include "cache/cache.h"
  11. #include "document/css/css.h"
  12. #include "document/css/parser.h"
  13. #include "document/css/property.h"
  14. #include "document/css/stylesheet.h"
  15. #include "document/document.h"
  16. #include "document/dom/renderer.h"
  17. #include "document/dom/util.h"
  18. #include "document/dom/rss.h"
  19. #include "document/renderer.h"
  20. #include "dom/configuration.h"
  21. #include "dom/scanner.h"
  22. #include "dom/sgml/parser.h"
  23. #include "dom/sgml/html/html.h"
  24. #include "dom/sgml/rss/rss.h"
  25. #include "dom/node.h"
  26. #include "dom/stack.h"
  27. #include "intl/charsets.h"
  28. #include "protocol/uri.h"
  29. #include "terminal/draw.h"
  30. #include "util/error.h"
  31. #include "util/memory.h"
  32. #include "util/string.h"
  /* Non-zero when the byte range [str, str + len) lies inside the renderer's
   * source buffer, i.e. between (renderer)->source and (renderer)->end.
   * The arguments are expanded more than once, so only pass expressions
   * without side effects. */
  #define check_dom_node_source(renderer, str, len) \
      ((str) >= (renderer)->source && (renderer)->end >= (str) + (len))

  /* Assert that [str, str + len) lies inside the renderer's source buffer.
   * The failure message prints the buffer bounds and the offending range. */
  #define assert_source(renderer, str, len) \
      assertm(check_dom_node_source(renderer, str, len), \
              "renderer[%p : %p] str[%p : %p]", \
              (renderer)->source, (renderer)->end, (str), (str) + (len))

  /* POSIX extended regular expression used to spot URLs in text: either
   * "file://", or an ftp/http/nttp (optionally with a trailing "s") or smb
   * scheme followed by a host name, an optional ":port" and an optional
   * path made of '/'-separated segments. */
  #define URL_REGEX "(file://|((f|ht|nt)tp(s)?|smb)://[[:alnum:]]+([-@:.]?[[:alnum:]])*\\.[[:alpha:]]{2,4}(:[[:digit:]]+)?)(/(%[[:xdigit:]]{2}|[-_~&=;?.a-z0-9])*)*"

  /* regcomp() flags for URL_REGEX: extended syntax, case-insensitive. */
  #define URL_REGFLAGS (REG_EXTENDED | REG_ICASE)
  40. struct source_renderer {
  41. #ifdef HAVE_REGEX_H
  42. regex_t url_regex;
  43. unsigned int find_url:1;
  44. #endif
  45. /* One style per node type. */
  46. struct screen_char styles[DOM_NODES];
  47. };
  48. static inline void
  49. render_dom_flush(struct dom_renderer *renderer, unsigned char *string)
  50. {
  51. struct source_renderer *data = renderer->data;
  52. struct screen_char *template_ = &data->styles[DOM_NODE_TEXT];
  53. int length = string - renderer->position;
  54. assert_source(renderer, renderer->position, 0);
  55. assert_source(renderer, string, 0);
  56. if (length <= 0) return;
  57. render_dom_text(renderer, template_, renderer->position, length);
  58. renderer->position = string;
  59. assert_source(renderer, renderer->position, 0);
  60. }
  61. static inline void
  62. render_dom_node_text(struct dom_renderer *renderer, struct screen_char *template_,
  63. struct dom_node *node)
  64. {
  65. unsigned char *string = node->string.string;
  66. int length = node->string.length;
  67. if (node->type == DOM_NODE_ENTITY_REFERENCE) {
  68. string -= 1;
  69. length += 2;
  70. }
  71. if (check_dom_node_source(renderer, string, length)) {
  72. render_dom_flush(renderer, string);
  73. renderer->position = string + length;
  74. assert_source(renderer, renderer->position, 0);
  75. }
  76. render_dom_text(renderer, template_, string, length);
  77. }
  #ifdef HAVE_REGEX_H
  /* Render the string of @node like render_dom_node_text() would, but turn
   * every substring matching the URL regex into a link.
   *
   * Text between matches is rendered with the style of the node's type; the
   * matches themselves are handed to add_dom_link(). If no private copy of
   * the text can be allocated, the text is rendered without link detection. */
  static inline void
  render_dom_node_enhanced_text(struct dom_renderer *renderer, struct dom_node *node)
  {
      struct source_renderer *data = renderer->data;
      regex_t *regex = &data->url_regex;
      regmatch_t regmatch;
      unsigned char *string = node->string.string;
      int length = node->string.length;
      struct screen_char *template_ = &data->styles[node->type];
      unsigned char *alloc_string;

      if (check_dom_node_source(renderer, string, length)) {
          render_dom_flush(renderer, string);
          renderer->position = string + length;
          assert_source(renderer, renderer->position, 0);
      }

      /* regexec() needs a NUL-terminated string and nothing here guarantees
       * that the node text is terminated at @length, so the matching runs on
       * a private copy (memacpy() presumably terminates it). */
      alloc_string = memacpy(string, length);
      if (!alloc_string) {
          /* Without the copy regexec() could read past the node text;
           * fall back to plain rendering instead. */
          if (length > 0)
              render_dom_text(renderer, template_, string, length);
          return;
      }
      string = alloc_string;

      while (length > 0 && !regexec(regex, string, 1, &regmatch, 0)) {
          int matchlen = regmatch.rm_eo - regmatch.rm_so;
          int offset = regmatch.rm_so;

          /* Stop on empty matches (no progress would be made) and on
           * matches that reach outside the remaining text. */
          if (!matchlen || offset < 0 || regmatch.rm_eo > length)
              break;

          /* Plain text in front of the URL. */
          if (offset > 0)
              render_dom_text(renderer, template_, string, offset);

          string += offset;
          length -= offset;

          /* The URL text is both the link label and the link target. */
          add_dom_link(renderer, string, matchlen, string, matchlen);

          length -= matchlen;
          string += matchlen;
      }

      /* Whatever is left after the last match. */
      if (length > 0)
          render_dom_text(renderer, template_, string, length);

      mem_free(alloc_string);
  }
  #endif
  115. static enum dom_code
  116. render_dom_node_source(struct dom_stack *stack, struct dom_node *node, void *xxx)
  117. {
  118. struct dom_renderer *renderer = stack->current->data;
  119. struct source_renderer *data = renderer->data;
  120. assert(node && renderer && renderer->document);
  121. #ifdef HAVE_REGEX_H
  122. if (data->find_url
  123. && (node->type == DOM_NODE_TEXT
  124. || node->type == DOM_NODE_CDATA_SECTION
  125. || node->type == DOM_NODE_COMMENT)) {
  126. render_dom_node_enhanced_text(renderer, node);
  127. } else
  128. #endif
  129. render_dom_node_text(renderer, &data->styles[node->type], node);
  130. return DOM_CODE_OK;
  131. }
  132. /* This callback is also used for rendering processing instruction nodes. */
  133. static enum dom_code
  134. render_dom_element_source(struct dom_stack *stack, struct dom_node *node, void *xxx)
  135. {
  136. struct dom_renderer *renderer = stack->current->data;
  137. struct source_renderer *data = renderer->data;
  138. assert(node && renderer && renderer->document);
  139. render_dom_node_text(renderer, &data->styles[node->type], node);
  140. return DOM_CODE_OK;
  141. }
  142. static enum dom_code
  143. render_dom_element_end_source(struct dom_stack *stack, struct dom_node *node, void *xxx)
  144. {
  145. struct dom_renderer *renderer = stack->current->data;
  146. struct source_renderer *data = renderer->data;
  147. struct dom_stack_state *state = get_dom_stack_top(stack);
  148. struct sgml_parser_state *pstate = get_dom_stack_state_data(stack->contexts[0], state);
  149. struct dom_scanner_token *token = &pstate->end_token;
  150. unsigned char *string = token->string.string;
  151. int length = token->string.length;
  152. assert(node && renderer && renderer->document);
  153. if (!string || !length)
  154. return DOM_CODE_OK;
  155. if (check_dom_node_source(renderer, string, length)) {
  156. render_dom_flush(renderer, string);
  157. renderer->position = string + length;
  158. assert_source(renderer, renderer->position, 0);
  159. }
  160. render_dom_text(renderer, &data->styles[node->type], string, length);
  161. return DOM_CODE_OK;
  162. }
  163. static void
  164. set_base_uri(struct dom_renderer *renderer, unsigned char *value, size_t valuelen)
  165. {
  166. unsigned char *href = memacpy(value, valuelen);
  167. unsigned char *uristring;
  168. struct uri *uri;
  169. if (!href) return;
  170. uristring = join_urls(renderer->base_uri, href);
  171. mem_free(href);
  172. if (!uristring) return;
  173. uri = get_uri(uristring, 0);
  174. mem_free(uristring);
  175. if (!uri) return;
  176. done_uri(renderer->base_uri);
  177. renderer->base_uri = uri;
  178. }
  179. static enum dom_code
  180. render_dom_attribute_source(struct dom_stack *stack, struct dom_node *node, void *xxx)
  181. {
  182. struct dom_renderer *renderer = stack->current->data;
  183. struct source_renderer *data = renderer->data;
  184. struct screen_char *template_ = &data->styles[node->type];
  185. assert(node && renderer->document);
  186. render_dom_node_text(renderer, template_, node);
  187. if (is_dom_string_set(&node->data.attribute.value)) {
  188. int quoted = node->data.attribute.quoted == 1;
  189. unsigned char *value = node->data.attribute.value.string - quoted;
  190. int valuelen = node->data.attribute.value.length + quoted * 2;
  191. if (check_dom_node_source(renderer, value, 0)) {
  192. render_dom_flush(renderer, value);
  193. renderer->position = value + valuelen;
  194. assert_source(renderer, renderer->position, 0);
  195. }
  196. if (node->data.attribute.reference
  197. && valuelen - quoted * 2 > 0) {
  198. int skips;
  199. /* Need to flush the first quoting delimiter and any
  200. * leading whitespace so that the renderers x position
  201. * is at the start of the value string. */
  202. for (skips = 0; skips < valuelen; skips++) {
  203. if ((quoted && skips == 0)
  204. || isspace(value[skips])
  205. || value[skips] < ' ')
  206. continue;
  207. break;
  208. }
  209. if (skips > 0) {
  210. render_dom_text(renderer, template_, value, skips);
  211. value += skips;
  212. valuelen -= skips;
  213. }
  214. /* Figure out what should be skipped after the actual
  215. * link text. */
  216. for (skips = 0; skips < valuelen; skips++) {
  217. if ((quoted && skips == 0)
  218. || isspace(value[valuelen - skips - 1])
  219. || value[valuelen - skips - 1] < ' ')
  220. continue;
  221. break;
  222. }
  223. if (renderer->doctype == SGML_DOCTYPE_HTML
  224. && node->data.attribute.type == HTML_ATTRIBUTE_HREF
  225. && node->parent->data.element.type == HTML_ELEMENT_BASE) {
  226. set_base_uri(renderer, value, valuelen - skips);
  227. }
  228. add_dom_link(renderer, value, valuelen - skips,
  229. value, valuelen - skips);
  230. if (skips > 0) {
  231. value += valuelen - skips;
  232. render_dom_text(renderer, template_, value, skips);
  233. }
  234. } else {
  235. render_dom_text(renderer, template_, value, valuelen);
  236. }
  237. }
  238. return DOM_CODE_OK;
  239. }
  240. static enum dom_code
  241. render_dom_cdata_source(struct dom_stack *stack, struct dom_node *node, void *xxx)
  242. {
  243. struct dom_renderer *renderer = stack->current->data;
  244. struct source_renderer *data = renderer->data;
  245. unsigned char *string = node->string.string;
  246. assert(node && renderer && renderer->document);
  247. /* Highlight the 'CDATA' part of <![CDATA[ if it is there. */
  248. if (check_dom_node_source(renderer, string - 6, 6)) {
  249. render_dom_flush(renderer, string - 6);
  250. render_dom_text(renderer, &data->styles[DOM_NODE_ATTRIBUTE], string - 6, 5);
  251. renderer->position = string - 1;
  252. assert_source(renderer, renderer->position, 0);
  253. }
  254. render_dom_node_text(renderer, &data->styles[node->type], node);
  255. return DOM_CODE_OK;
  256. }
  257. static enum dom_code
  258. render_dom_document_start(struct dom_stack *stack, struct dom_node *node, void *xxx)
  259. {
  260. struct dom_renderer *renderer = stack->current->data;
  261. struct document *document = renderer->document;
  262. struct source_renderer *data;
  263. int type;
  264. struct css_stylesheet *css = &default_stylesheet;
  265. {
  266. static int i_want_struct_module_for_dom;
  267. if (!i_want_struct_module_for_dom) {
  268. static const unsigned char default_colors[] =
  269. "document { color: yellow } "
  270. "element { color: lightgreen } "
  271. "entity-reference { color: red } "
  272. "proc-instruction { color: red } "
  273. "attribute { color: magenta } "
  274. "comment { color: aqua } "
  275. "cdata-section { color: orange2 } ";
  276. i_want_struct_module_for_dom = 1;
  277. /* When someone will get here earlier than at 4am,
  278. * this will be done in some init function, perhaps
  279. * not overriding the user's default stylesheet. */
  280. css_parse_stylesheet(css, NULL, default_colors,
  281. default_colors + sizeof(default_colors));
  282. }
  283. }
  284. data = renderer->data = mem_calloc(1, sizeof(*data));
  285. /* Initialize styles for all the DOM node types. */
  286. for (type = 0; type < DOM_NODES; type++) {
  287. struct screen_char *template_ = &data->styles[type];
  288. struct dom_string *name = get_dom_node_type_name(type);
  289. struct css_selector *selector = NULL;
  290. if (name && is_dom_string_set(name))
  291. selector = find_css_selector(&css->selectors,
  292. CST_ELEMENT, CSR_ROOT,
  293. name->string, name->length);
  294. init_template_by_style(template_, &document->options,
  295. selector ? &selector->properties : NULL);
  296. }
  297. #ifdef HAVE_REGEX_H
  298. if (document->options.plain_display_links) {
  299. if (regcomp(&data->url_regex, URL_REGEX, URL_REGFLAGS)) {
  300. regfree(&data->url_regex);
  301. } else {
  302. data->find_url = 1;
  303. }
  304. }
  305. #endif
  306. return DOM_CODE_OK;
  307. }
  308. static enum dom_code
  309. render_dom_document_end(struct dom_stack *stack, struct dom_node *node, void *xxx)
  310. {
  311. struct dom_renderer *renderer = stack->current->data;
  312. struct source_renderer *data = renderer->data;
  313. /* If there are no non-element nodes after the last element node make
  314. * sure that we flush to the end of the cache entry source including
  315. * the '>' of the last element tag if it has one. (bug 519) */
  316. if (check_dom_node_source(renderer, renderer->position, 0)) {
  317. render_dom_flush(renderer, renderer->end);
  318. }
  319. #ifdef HAVE_REGEX_H
  320. if (data->find_url)
  321. regfree(&data->url_regex);
  322. #endif
  323. mem_free(data);
  324. /* It is not necessary to return DOM_CODE_FREE_NODE here.
  325. * Because the parser was created with the SGML_PARSER_STREAM
  326. * type, the stack has the DOM_STACK_FLAG_FREE_NODES flag and
  327. * implicitly frees all nodes popped from it. */
  328. return DOM_CODE_OK;
  329. }
  330. struct dom_stack_context_info dom_source_renderer_context_info = {
  331. /* Object size: */ 0,
  332. /* Push: */
  333. {
  334. /* */ NULL,
  335. /* DOM_NODE_ELEMENT */ render_dom_element_source,
  336. /* DOM_NODE_ATTRIBUTE */ render_dom_attribute_source,
  337. /* DOM_NODE_TEXT */ render_dom_node_source,
  338. /* DOM_NODE_CDATA_SECTION */ render_dom_cdata_source,
  339. /* DOM_NODE_ENTITY_REFERENCE */ render_dom_node_source,
  340. /* DOM_NODE_ENTITY */ render_dom_node_source,
  341. /* DOM_NODE_PROC_INSTRUCTION */ render_dom_element_source,
  342. /* DOM_NODE_COMMENT */ render_dom_node_source,
  343. /* DOM_NODE_DOCUMENT */ render_dom_document_start,
  344. /* DOM_NODE_DOCUMENT_TYPE */ render_dom_node_source,
  345. /* DOM_NODE_DOCUMENT_FRAGMENT */ render_dom_node_source,
  346. /* DOM_NODE_NOTATION */ render_dom_node_source,
  347. },
  348. /* Pop: */
  349. {
  350. /* */ NULL,
  351. /* DOM_NODE_ELEMENT */ render_dom_element_end_source,
  352. /* DOM_NODE_ATTRIBUTE */ NULL,
  353. /* DOM_NODE_TEXT */ NULL,
  354. /* DOM_NODE_CDATA_SECTION */ NULL,
  355. /* DOM_NODE_ENTITY_REFERENCE */ NULL,
  356. /* DOM_NODE_ENTITY */ NULL,
  357. /* DOM_NODE_PROC_INSTRUCTION */ NULL,
  358. /* DOM_NODE_COMMENT */ NULL,
  359. /* DOM_NODE_DOCUMENT */ render_dom_document_end,
  360. /* DOM_NODE_DOCUMENT_TYPE */ NULL,
  361. /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
  362. /* DOM_NODE_NOTATION */ NULL,
  363. }
  364. };