/cgi/html.c
C | 839 lines | 766 code | 29 blank | 44 comment | 244 complexity | 6b47cb857135d485fc3699f2b67f6ce2 MD5 | raw file
Possible License(s): BSD-2-Clause
- /*
- * Copyright 2001-2004 Brandon Long
- * All Rights Reserved.
- *
- * ClearSilver Templating System
- *
- * This code is made available under the terms of the ClearSilver License.
- * http://www.clearsilver.net/license.hdf
- *
- */
- #include "cs_config.h"
- #include <stdlib.h>
- #include <string.h>
- #include <sys/types.h>
- #include <regex.h>
- #include <ctype.h>
- #include "util/neo_misc.h"
- #include "util/neo_err.h"
- #include "util/neo_str.h"
- #include "util/ulocks.h"
- #include "html.h"
- #include "cgi.h"
/* Heuristically decide whether plain text depends on whitespace layout.
 *
 * Scans the first slen bytes of src and returns:
 *   1  if a tab is seen, or an ordinary character is reached after more
 *      than two counted spaces or more than two newlines;
 *   2  if more than three punctuation characters from the ASCII-art set
 *      occur without an ordinary character in between;
 *   0  otherwise.
 *
 * A space directly following a '.' is not counted (normal sentence
 * spacing), a newline resets the space count, and '\r' is ignored.
 */
static int has_space_formatting(const char *src, int slen)
{
  int spaces = 0;
  int returns = 0;
  int ascii_art = 0;
  int x;

  for (x = 0; x < slen; x++)
  {
    const char c = src[x];

    if (c == '\t') return 1;

    if (c == ' ')
    {
      if (!(x && (src[x-1] == '.')))
        spaces++;
    }
    else if (c == '\n')
    {
      spaces = 0;
      returns++;
    }
    /* strchr() also matches the terminating NUL of the set, so a NUL byte
     * in the input must be excluded explicitly or it counts as ASCII art. */
    else if (c != '\0' && strchr ("/\\<>:[]!@#$%^&*()|", c))
    {
      ascii_art++;
      if (ascii_art > 3) return 2;
    }
    else if (c != '\r')
    {
      /* Ordinary character: evaluate the run that preceded it, then reset. */
      if (returns > 2) return 1;
      if (spaces > 2) return 1;
      returns = 0;
      spaces = 0;
      ascii_art = 0;
    }
  }
  return 0;
}
- /*
- static int has_long_lines (char *s, int l)
- {
- char *ptr;
- int x = 0;
- while (x < l)
- {
- ptr = strchr (s + x, '\n');
- if (ptr == NULL)
- {
- if (l - x > 75) return 1;
- return 0;
- }
- if (ptr - (s + x) > 75) return 1;
- x = ptr - s + 1;
- }
- return 0;
- }
- */
/* The first step is to find all of the URLs and email addresses using
 * our handy regular expressions and mark where they are. We then go
 * through the text, converting the non-special areas with straight
 * text->html escapes and converting the special parts (URLs and email
 * addresses) as special parts.
 */
/* One span of the source text that matched the URL or email pattern. */
struct _parts {
  int begin;  /* offset of the first character of the match (rm_so) */
  int end;    /* offset just past the match (rm_eo) */
  int type;   /* one of the SC_TYPE_* values below */
};

#define SC_TYPE_TEXT 1
#define SC_TYPE_URL 2
#define SC_TYPE_EMAIL 3

/* POSIX extended regular expressions used to locate email addresses and
 * URLs.  The strings are never modified, so they are const-qualified. */
static const char *EmailRe = "[^][@:;<>\\\"()[:space:][:cntrl:]]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";
static const char *URLRe = "((http|https|ftp|mailto):(//)?[^[:space:]>\"\t]*|www\\.[-a-z0-9\\.]+)[^[:space:];\t\">]*";

#ifdef HAVE_PTHREADS
/* In multi-threaded environments, we have to init thread safely */
static pthread_mutex_t InitLock = PTHREAD_MUTEX_INITIALIZER;
#endif

/* Non-zero once EmailRegex and UrlRegex hold compiled patterns. */
static int CompiledRe = 0;
static regex_t EmailRegex, UrlRegex;
- static NEOERR *split_and_convert (const char *src, int slen,
- STRING *out, HTML_CONVERT_OPTS *opts)
- {
- NEOERR *err = STATUS_OK;
- regmatch_t email_match, url_match;
- int errcode;
- char *ptr, *esc;
- char errbuf[256];
- struct _parts *parts;
- int part_count;
- int part;
- int x, i;
- int spaces = 0;
- if (!CompiledRe)
- {
- #ifdef HAVE_PTHREADS
- /* In threaded environments, we have to mutex lock to do this regcomp, but
- * we don't want to use a mutex every time to check that it was regcomp.
- * So, we only lock if our first test of compiled was false */
- err = mLock(&InitLock);
- if (err != STATUS_OK) return nerr_pass(err);
- if (CompiledRe == 0) {
- #endif
- if ((errcode = regcomp (&EmailRegex, EmailRe, REG_ICASE | REG_EXTENDED)))
- {
- regerror (errcode, &EmailRegex, errbuf, sizeof(errbuf));
- err = nerr_raise (NERR_PARSE, "Unable to compile EmailRE: %s", errbuf);
- }
- if ((errcode = regcomp (&UrlRegex, URLRe, REG_ICASE | REG_EXTENDED)))
- {
- regerror (errcode, &UrlRegex, errbuf, sizeof(errbuf));
- err = nerr_raise (NERR_PARSE, "Unable to compile URLRe: %s", errbuf);
- }
- CompiledRe = 1;
- #ifdef HAVE_PTHREADS
- }
- if (err) {
- mUnlock(&InitLock);
- return err;
- }
- err = mUnlock(&InitLock);
- if (err != STATUS_OK) return nerr_pass(err);
- #else
- if (err) {
- return err;
- }
- #endif
- }
- part_count = 20;
- parts = (struct _parts *) malloc (sizeof(struct _parts) * part_count);
- part = 0;
- x = 0;
- if (regexec (&EmailRegex, src+x, 1, &email_match, 0) != 0)
- {
- email_match.rm_so = -1;
- email_match.rm_eo = -1;
- }
- else
- {
- email_match.rm_so += x;
- email_match.rm_eo += x;
- }
- if (regexec (&UrlRegex, src+x, 1, &url_match, 0) != 0)
- {
- url_match.rm_so = -1;
- url_match.rm_eo = -1;
- }
- else
- {
- url_match.rm_so += x;
- url_match.rm_eo += x;
- }
- while ((x < slen) && !((email_match.rm_so == -1) && (url_match.rm_so == -1)))
- {
- if (part >= part_count)
- {
- void *new_ptr;
- part_count *= 2;
- new_ptr = realloc (parts, sizeof(struct _parts) * part_count);
- if (new_ptr == NULL) {
- free(parts);
- return nerr_raise (NERR_NOMEM,
- "Unable to increase url matcher to %d urls",
- part_count);
- }
- parts = (struct _parts *) new_ptr;
- }
- if ((url_match.rm_so != -1) && ((email_match.rm_so == -1) || (url_match.rm_so <= email_match.rm_so)))
- {
- parts[part].begin = url_match.rm_so;
- parts[part].end = url_match.rm_eo;
- parts[part].type = SC_TYPE_URL;
- x = parts[part].end + 1;
- part++;
- if (x < slen)
- {
- if (regexec (&UrlRegex, src+x, 1, &url_match, 0) != 0)
- {
- url_match.rm_so = -1;
- url_match.rm_eo = -1;
- }
- else
- {
- url_match.rm_so += x;
- url_match.rm_eo += x;
- }
- if ((email_match.rm_so != -1) && (x > email_match.rm_so))
- {
- if (regexec (&EmailRegex, src+x, 1, &email_match, 0) != 0)
- {
- email_match.rm_so = -1;
- email_match.rm_eo = -1;
- }
- else
- {
- email_match.rm_so += x;
- email_match.rm_eo += x;
- }
- }
- }
- }
- else
- {
- parts[part].begin = email_match.rm_so;
- parts[part].end = email_match.rm_eo;
- parts[part].type = SC_TYPE_EMAIL;
- x = parts[part].end + 1;
- part++;
- if (x < slen)
- {
- if (regexec (&EmailRegex, src+x, 1, &email_match, 0) != 0)
- {
- email_match.rm_so = -1;
- email_match.rm_eo = -1;
- }
- else
- {
- email_match.rm_so += x;
- email_match.rm_eo += x;
- }
- if ((url_match.rm_so != -1) && (x > url_match.rm_so))
- {
- if (regexec (&UrlRegex, src+x, 1, &url_match, 0) != 0)
- {
- url_match.rm_so = -1;
- url_match.rm_eo = -1;
- }
- else
- {
- url_match.rm_so += x;
- url_match.rm_eo += x;
- }
- }
- }
- }
- }
- i = 0;
- x = 0;
- while (x < slen)
- {
- if ((i >= part) || (x < parts[i].begin))
- {
- ptr = strpbrk(src + x, "&<>\r\n ");
- if (ptr == NULL)
- {
- if (spaces)
- {
- int sp;
- for (sp = 0; sp < spaces - 1; sp++)
- {
- err = string_append (out, " ");
- if (err != STATUS_OK) break;
- }
- if (err != STATUS_OK) break;
- err = string_append_char (out, ' ');
- }
- spaces = 0;
- if (i < part)
- {
- err = string_appendn (out, src + x, parts[i].begin - x);
- x = parts[i].begin;
- }
- else
- {
- err = string_append (out, src + x);
- x = slen;
- }
- }
- else
- {
- if ((i >= part) || ((ptr - src) < parts[i].begin))
- {
- if (spaces)
- {
- int sp;
- for (sp = 0; sp < spaces - 1; sp++)
- {
- err = string_append (out, " ");
- if (err != STATUS_OK) break;
- }
- if (err != STATUS_OK) break;
- err = string_append_char (out, ' ');
- }
- spaces = 0;
- err = string_appendn (out, src + x, (ptr - src) - x);
- if (err != STATUS_OK) break;
- x = ptr - src;
- if (src[x] == ' ')
- {
- if (opts->space_convert)
- {
- spaces++;
- }
- else
- err = string_append_char (out, ' ');
- }
- else
- {
- if (src[x] != '\n' && spaces)
- {
- int sp;
- for (sp = 0; sp < spaces - 1; sp++)
- {
- err = string_append (out, " ");
- if (err != STATUS_OK) break;
- }
- if (err != STATUS_OK) break;
- err = string_append_char (out, ' ');
- }
- spaces = 0;
- if (src[x] == '&')
- err = string_append (out, "&");
- else if (src[x] == '<')
- err = string_append (out, "<");
- else if (src[x] == '>')
- err = string_append (out, ">");
- else if (src[x] == '\n')
- if (opts->newlines_convert)
- err = string_append (out, "<br/>\n");
- else if (x && src[x-1] == '\n')
- err = string_append (out, "<p/>\n");
- else
- err = string_append_char (out, '\n');
- else if (src[x] != '\r')
- err = nerr_raise (NERR_ASSERT, "src[x] == '%c'", src[x]);
- }
- x++;
- }
- else
- {
- if (spaces)
- {
- int sp;
- for (sp = 0; sp < spaces - 1; sp++)
- {
- err = string_append (out, " ");
- if (err != STATUS_OK) break;
- }
- if (err != STATUS_OK) break;
- err = string_append_char (out, ' ');
- }
- spaces = 0;
- err = string_appendn (out, src + x, parts[i].begin - x);
- x = parts[i].begin;
- }
- }
- }
- else
- {
- if (spaces)
- {
- int sp;
- for (sp = 0; sp < spaces - 1; sp++)
- {
- err = string_append (out, " ");
- if (err != STATUS_OK) break;
- }
- if (err != STATUS_OK) break;
- err = string_append_char (out, ' ');
- }
- spaces = 0;
- if (parts[i].type == SC_TYPE_URL)
- {
- char last_char = src[parts[i].end-1];
- int suffix=0;
- if (last_char == '.' || last_char == ',') { suffix=1; }
- err = string_append (out, " <a ");
- if (err != STATUS_OK) break;
- if (opts->url_class)
- {
- err = string_appendf (out, "class=%s ", opts->url_class);
- if (err) break;
- }
- if (opts->url_target)
- {
- err = string_appendf (out, "target=\"%s\" ", opts->url_target);
- if (err) break;
- }
- err = string_append(out, "href=\"");
- if (err) break;
- if (opts->bounce_url)
- {
- char *url, *esc_url, *new_url;
- int url_len;
- if (!strncasecmp(src + x, "www.", 4))
- {
- url_len = 7 + parts[i].end - x - suffix;
- url = (char *) malloc(url_len+1);
- if (url == NULL)
- {
- err = nerr_raise(NERR_NOMEM,
- "Unable to allocate memory to convert url");
- break;
- }
- strcpy(url, "http://");
- strncat(url, src + x, parts[i].end - x - suffix);
- }
- else
- {
- url_len = parts[i].end - x - suffix;
- url = (char *) malloc(url_len+1);
- if (url == NULL)
- {
- err = nerr_raise(NERR_NOMEM,
- "Unable to allocate memory to convert url");
- break;
- }
- strncpy(url, src + x, parts[i].end - x - suffix);
- url[url_len] = '\0';
- }
- err = cgi_url_escape(url, &esc_url);
- free(url);
- if (err) {
- free(esc_url);
- break;
- }
- new_url = sprintf_alloc(opts->bounce_url, esc_url);
- free(esc_url);
- if (new_url == NULL)
- {
- err = nerr_raise(NERR_NOMEM, "Unable to allocate memory to convert url");
- break;
- }
- err = string_append (out, new_url);
- free(new_url);
- if (err) break;
- }
- else
- {
- if (!strncasecmp(src + x, "www.", 4))
- {
- err = string_append (out, "http://");
- if (err != STATUS_OK) break;
- }
- err = string_appendn (out, src + x, parts[i].end - x - suffix);
- if (err != STATUS_OK) break;
- }
- err = string_append (out, "\">");
- if (err != STATUS_OK) break;
- if (opts->link_name) {
- err = html_escape_alloc((opts->link_name),
- strlen(opts->link_name), &esc);
- } else {
- err = html_escape_alloc((src + x), parts[i].end - x - suffix, &esc);
- }
- if (err != STATUS_OK) break;
- err = string_append (out, esc);
- free(esc);
- if (err != STATUS_OK) break;
- err = string_append (out, "</a>");
- if (suffix) {
- err = string_appendn(out,src + parts[i].end - 1,1);
- if (err != STATUS_OK) break;
- }
- }
- else /* type == SC_TYPE_EMAIL */
- {
- err = string_append (out, "<a ");
- if (err != STATUS_OK) break;
- if (opts->mailto_class)
- {
- err = string_appendf (out, "class=%s ", opts->mailto_class);
- if (err) break;
- }
- err = string_append(out, "href=\"mailto:");
- if (err) break;
- err = string_appendn (out, src + x, parts[i].end - x);
- if (err != STATUS_OK) break;
- err = string_append (out, "\">");
- if (err != STATUS_OK) break;
- err = html_escape_alloc(src + x, parts[i].end - x, &esc);
- if (err != STATUS_OK) break;
- err = string_append (out, esc);
- free(esc);
- if (err != STATUS_OK) break;
- err = string_append (out, "</a>");
- }
- x = parts[i].end;
- i++;
- }
- if (err != STATUS_OK) break;
- }
- free (parts);
- return err;
- }
- static void strip_white_space_end (STRING *str)
- {
- int x = 0;
- int ol = str->len;
- char *ptr;
- int i;
- while (x < str->len)
- {
- ptr = strchr(str->buf + x, '\n');
- if (ptr == NULL)
- {
- /* just strip the white space at the end of the string */
- ol = strlen(str->buf);
- while (ol && isspace(str->buf[ol-1]))
- {
- str->buf[ol - 1] = '\0';
- ol--;
- }
- str->len = ol;
- return;
- }
- else
- {
- x = i = ptr - str->buf;
- if (x)
- {
- x--;
- while (x && isspace(str->buf[x]) && (str->buf[x] != '\n')) x--;
- if (x) x++;
- memmove (str->buf + x, ptr, ol - i + 1);
- x++;
- str->len -= ((i - x) + 1);
- str->buf[str->len] = '\0';
- ol = str->len;
- }
- }
- }
- }
- NEOERR *convert_text_html_alloc (const char *src, int slen,
- char **out)
- {
- return nerr_pass(convert_text_html_alloc_options(src, slen, out, NULL));
- }
- NEOERR *convert_text_html_alloc_options (const char *src, int slen,
- char **out,
- HTML_CONVERT_OPTS *opts)
- {
- NEOERR *err;
- STRING out_s;
- int formatting = 0;
- HTML_CONVERT_OPTS my_opts;
- string_init(&out_s);
- if (opts == NULL)
- {
- opts = &my_opts;
- opts->bounce_url = NULL;
- opts->url_class = NULL;
- opts->url_target = "_blank";
- opts->mailto_class = NULL;
- opts->long_lines = 0;
- opts->space_convert = 0;
- opts->newlines_convert = 1;
- opts->longline_width = 75; /* This hasn't been used in a while, actually */
- opts->check_ascii_art = 1;
- opts->link_name = NULL;
- }
- do
- {
- if (opts->check_ascii_art)
- {
- formatting = has_space_formatting (src, slen);
- if (formatting) opts->space_convert = 1;
- }
- if (formatting == 2)
- {
- /* Do <pre> formatting */
- opts->newlines_convert = 1;
- err = string_append (&out_s, "<tt>");
- if (err != STATUS_OK) break;
- err = split_and_convert(src, slen, &out_s, opts);
- if (err != STATUS_OK) break;
- err = string_append (&out_s, "</tt>");
- if (err != STATUS_OK) break;
- /* Strip white space at end of lines */
- strip_white_space_end (&out_s);
- }
- else
- {
- /* int nl = has_long_lines (src, slen); */
- err = split_and_convert(src, slen, &out_s, opts);
- }
- } while (0);
- if (err != STATUS_OK)
- {
- string_clear (&out_s);
- return nerr_pass (err);
- }
- if (out_s.buf == NULL)
- {
- *out = strdup("");
- }
- else
- {
- *out = out_s.buf;
- }
- return STATUS_OK;
- }
- NEOERR *html_escape_alloc (const char *src, int slen,
- char **out)
- {
- return nerr_pass(neos_html_escape(src, slen, out));
- }
/* Translate the name of an HTML character reference (the text between
 * '&' and ';') into a single ISO-8859-1 byte.
 *
 * s is either a numeric reference ("#65", "#x41") or one of the entity
 * names in the table below, compared case-sensitively.
 *
 * Returns the character code, or 0 when the name is empty or unknown, or
 * when a numeric reference falls outside 0..255 and therefore cannot be
 * represented in one byte (previously such values were truncated to an
 * unrelated character).
 */
static unsigned char _expand_amp_8859_1_char (const char *s)
{
  static const struct
  {
    const char *name;
    unsigned char code;
  } entities[] = {
    { "agrave", 0xe0 }, { "aacute", 0xe1 }, { "acirc",  0xe2 },
    { "atilde", 0xe3 }, { "auml",   0xe4 }, { "aring",  0xe5 },
    { "aelig",  0xe6 }, { "amp",    '&'  },
    { "ccedil", 0xe7 },
    { "egrave", 0xe8 }, { "eacute", 0xe9 }, { "ecirc",  0xea },
    { "euml",   0xeb }, { "eth",    0xf0 },
    { "igrave", 0xec }, { "iacute", 0xed }, { "icirc",  0xee },
    { "iuml",   0xef },
    { "gt",     '>'  },
    { "lt",     '<'  },
    { "ntilde", 0xf1 }, { "nbsp",   ' '  },
    { "ograve", 0xf2 }, { "oacute", 0xf3 }, { "ocirc",  0xf4 },
    { "otilde", 0xf5 }, { "ouml",   0xf6 }, { "oslash", 0xf8 },
    { "quot",   '"'  },
    { "szlig",  0xdf },
    { "thorn",  0xfe },
    { "ugrave", 0xf9 }, { "uacute", 0xfa }, { "ucirc",  0xfb },
    { "uuml",   0xfc },
    { "yacute", 0xfd }
  };
  size_t idx;

  if (s[0] == '\0')
    return 0;

  if (s[0] == '#')
  {
    /* "#xNN" is hexadecimal, "#NN" is decimal. */
    long value = (s[1] == 'x') ? strtol (s+2, NULL, 16)
                               : strtol (s+1, NULL, 10);

    if (value < 0 || value > 0xff)
      return 0;
    return (unsigned char) value;
  }

  for (idx = 0; idx < sizeof(entities) / sizeof(entities[0]); idx++)
  {
    if (!strcmp(s, entities[idx].name))
      return entities[idx].code;
  }
  return 0;
}
/* Expand an HTML character reference name to its replacement text.
 *
 * amp  the reference name, without the surrounding '&' and ';'.
 * buf  caller-supplied scratch space of at least two bytes.
 *
 * A recognised reference is written into buf as a one-character string
 * and buf is returned.  Otherwise a constant string is returned: "(C)"
 * for "copy", and "" for anything else.
 */
char *html_expand_amp_8859_1(const char *amp,
                             char *buf)
{
  const unsigned char code = _expand_amp_8859_1_char(amp);

  if (code != '\0')
  {
    buf[0] = (char)code;
    buf[1] = '\0';
    return buf;
  }

  if (strcmp(amp, "copy") == 0)
    return "(C)";
  return "";
}
- NEOERR *html_strip_alloc(const char *src, int slen,
- char **out)
- {
- NEOERR *err = STATUS_OK;
- STRING out_s;
- int x = 0;
- int strip_match = -1;
- int state = 0;
- char amp[10];
- int amp_start = 0;
- char buf[10];
- int ampl = 0;
- string_init(&out_s);
- err = string_append (&out_s, "");
- if (err) return nerr_pass (err);
- while (x < slen)
- {
- switch (state) {
- case 0:
- /* Default */
- if (src[x] == '&')
- {
- state = 3;
- ampl = 0;
- amp_start = x;
- }
- else if (src[x] == '<')
- {
- state = 1;
- }
- else
- {
- if (strip_match == -1)
- {
- err = string_append_char(&out_s, src[x]);
- if (err) break;
- }
- }
- x++;
- break;
- case 1:
- /* Starting TAG */
- if (src[x] == '>')
- {
- state = 0;
- }
- else if (src[x] == '/')
- {
- }
- else
- {
- }
- x++;
- break;
- case 2:
- /* In TAG */
- if (src[x] == '>')
- {
- state = 0;
- }
- x++;
- break;
- case 3:
- /* In AMP */
- if (src[x] == ';')
- {
- amp[ampl] = '\0';
- state = 0;
- err = string_append(&out_s, html_expand_amp_8859_1(amp, buf));
- if (err) break;
- }
- else
- {
- if (ampl < sizeof(amp)-1)
- amp[ampl++] = tolower(src[x]);
- else
- {
- /* broken html... just back up */
- x = amp_start;
- err = string_append_char(&out_s, src[x]);
- if (err) break;
- state = 0;
- }
- }
- x++;
- break;
- }
- if (err) break;
- }
- if (err)
- {
- string_clear (&out_s);
- return nerr_pass (err);
- }
- *out = out_s.buf;
- return STATUS_OK;
- }