/pdf/pdf_lex.c
C | 553 lines | 520 code | 24 blank | 9 comment | 89 complexity | 8cdefd7997c32b878200c7482790ca1d MD5 | raw file
- #include "fitz-internal.h"
- #include "mupdf-internal.h"
/*
 * Character-class macros meant to be used as switch labels. Each one
 * expands to a chain of case constants, so it is written after a leading
 * "case" and followed by a colon, e.g. "case IS_WHITE:".
 */

/* Contiguous ranges from which the larger classes are assembled. */
#define RANGE_0_9 \
	'0':case'1':case'2':case'3':case'4':case'5':\
	case'6':case'7':case'8':case'9'
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'

/* Characters that may begin a number: sign, decimal point or digit. */
#define IS_NUMBER \
	'+':case'-':case'.':case RANGE_0_9

/* NUL, TAB, LF, FF, CR and SPACE. */
#define IS_WHITE \
	'\000':case'\011':case'\012':case'\014':case'\015':case'\040'

/* Hexadecimal digits in either letter case. */
#define IS_HEX \
	RANGE_0_9:case RANGE_a_f:case RANGE_A_F

/* Characters that terminate a name or keyword without being whitespace. */
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':case'{':\
	case'}':case'/':case'%'
/* Return 1 if ch is NUL, TAB, LF, FF, CR or SPACE, and 0 otherwise. */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':
	case '\011':
	case '\012':
	case '\014':
	case '\015':
	case '\040':
		return 1;
	default:
		return 0;
	}
}
/*
 * Convert one hexadecimal digit (either letter case) to its value 0..15.
 * Any other input yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if (ch >= '0' && ch <= '9')
		value = ch - '0';
	else if (ch >= 'A' && ch <= 'F')
		value = 10 + (ch - 'A');
	else if (ch >= 'a' && ch <= 'f')
		value = 10 + (ch - 'a');

	return value;
}
- static void
- lex_white(fz_stream *f)
- {
- int c;
- do {
- c = fz_read_byte(f);
- } while ((c <= 32) && (iswhite(c)));
- if (c != EOF)
- fz_unread_byte(f);
- }
- static void
- lex_comment(fz_stream *f)
- {
- int c;
- do {
- c = fz_read_byte(f);
- } while ((c != '\012') && (c != '\015') && (c != EOF));
- }
- static int
- lex_number(fz_stream *f, pdf_lexbuf *buf, int c)
- {
- int neg = 0;
- int i = 0;
- int n;
- int d;
- float v;
- /* Initially we might have +, -, . or a digit */
- switch (c)
- {
- case '.':
- goto loop_after_dot;
- case '-':
- neg = 1;
- break;
- case '+':
- break;
- default: /* Must be a digit */
- i = c - '0';
- break;
- }
- while (1)
- {
- c = fz_read_byte(f);
- switch (c)
- {
- case '.':
- goto loop_after_dot;
- case RANGE_0_9:
- i = 10*i + c - '0';
- /* FIXME: Need overflow check here; do we care? */
- break;
- default:
- fz_unread_byte(f);
- /* Fallthrough */
- case EOF:
- if (neg)
- i = -i;
- buf->i = i;
- return PDF_TOK_INT;
- }
- }
- /* In here, we've seen a dot, so can accept just digits */
- loop_after_dot:
- n = 0;
- d = 1;
- while (1)
- {
- c = fz_read_byte(f);
- switch (c)
- {
- case RANGE_0_9:
- if (d >= INT_MAX/10)
- goto underflow;
- n = n*10 + (c - '0');
- d *= 10;
- break;
- default:
- fz_unread_byte(f);
- /* Fallthrough */
- case EOF:
- v = (float)i + ((float)n / (float)d);
- if (neg)
- v = -v;
- buf->f = v;
- return PDF_TOK_REAL;
- }
- }
- underflow:
- /* Ignore any digits after here, because they are too small */
- while (1)
- {
- c = fz_read_byte(f);
- switch (c)
- {
- case RANGE_0_9:
- break;
- default:
- fz_unread_byte(f);
- /* Fallthrough */
- case EOF:
- v = (float)i + ((float)n / (float)d);
- if (neg)
- v = -v;
- buf->f = v;
- return PDF_TOK_REAL;
- }
- }
- }
- static void
- lex_name(fz_stream *f, pdf_lexbuf *buf)
- {
- char *s = buf->scratch;
- int n = buf->size;
- while (n > 1)
- {
- int c = fz_read_byte(f);
- switch (c)
- {
- case IS_WHITE:
- case IS_DELIM:
- fz_unread_byte(f);
- goto end;
- case EOF:
- goto end;
- case '#':
- {
- int d;
- c = fz_read_byte(f);
- switch (c)
- {
- case RANGE_0_9:
- d = (c - '0') << 4;
- break;
- case RANGE_a_f:
- d = (c - 'a' + 10) << 4;
- break;
- case RANGE_A_F:
- d = (c - 'A' + 10) << 4;
- break;
- default:
- fz_unread_byte(f);
- /* fallthrough */
- case EOF:
- goto end;
- }
- c = fz_read_byte(f);
- switch (c)
- {
- case RANGE_0_9:
- c -= '0';
- break;
- case RANGE_a_f:
- c -= 'a' - 10;
- break;
- case RANGE_A_F:
- c -= 'A' - 10;
- break;
- default:
- fz_unread_byte(f);
- /* fallthrough */
- case EOF:
- *s++ = d;
- n--;
- goto end;
- }
- *s++ = d + c;
- n--;
- break;
- }
- default:
- *s++ = c;
- n--;
- break;
- }
- }
- end:
- *s = '\0';
- buf->len = s - buf->scratch;
- }
- static int
- lex_string(fz_stream *f, pdf_lexbuf *lb)
- {
- char *s = lb->scratch;
- char *e = s + lb->size;
- int bal = 1;
- int oct;
- int c;
- while (1)
- {
- if (s == e)
- {
- s += pdf_lexbuf_grow(lb);
- e = lb->scratch + lb->size;
- }
- c = fz_read_byte(f);
- switch (c)
- {
- case EOF:
- goto end;
- case '(':
- bal++;
- *s++ = c;
- break;
- case ')':
- bal --;
- if (bal == 0)
- goto end;
- *s++ = c;
- break;
- case '\\':
- c = fz_read_byte(f);
- switch (c)
- {
- case EOF:
- goto end;
- case 'n':
- *s++ = '\n';
- break;
- case 'r':
- *s++ = '\r';
- break;
- case 't':
- *s++ = '\t';
- break;
- case 'b':
- *s++ = '\b';
- break;
- case 'f':
- *s++ = '\f';
- break;
- case '(':
- *s++ = '(';
- break;
- case ')':
- *s++ = ')';
- break;
- case '\\':
- *s++ = '\\';
- break;
- case RANGE_0_9:
- oct = c - '0';
- c = fz_read_byte(f);
- if (c >= '0' && c <= '9')
- {
- oct = oct * 8 + (c - '0');
- c = fz_read_byte(f);
- if (c >= '0' && c <= '9')
- oct = oct * 8 + (c - '0');
- else if (c != EOF)
- fz_unread_byte(f);
- }
- else if (c != EOF)
- fz_unread_byte(f);
- *s++ = oct;
- break;
- case '\n':
- break;
- case '\r':
- c = fz_read_byte(f);
- if ((c != '\n') && (c != EOF))
- fz_unread_byte(f);
- break;
- default:
- *s++ = c;
- }
- break;
- default:
- *s++ = c;
- break;
- }
- }
- end:
- lb->len = s - lb->scratch;
- return PDF_TOK_STRING;
- }
- static int
- lex_hex_string(fz_stream *f, pdf_lexbuf *lb)
- {
- char *s = lb->scratch;
- char *e = s + lb->size;
- int a = 0, x = 0;
- int c;
- while (1)
- {
- if (s == e)
- {
- s += pdf_lexbuf_grow(lb);
- e = lb->scratch + lb->size;
- }
- c = fz_read_byte(f);
- switch (c)
- {
- case IS_WHITE:
- break;
- case IS_HEX:
- if (x)
- {
- *s++ = a * 16 + unhex(c);
- x = !x;
- }
- else
- {
- a = unhex(c);
- x = !x;
- }
- break;
- case '>':
- case EOF:
- goto end;
- default:
- fz_warn(f->ctx, "ignoring invalid character in hex string: '%c'", c);
- }
- }
- end:
- lb->len = s - lb->scratch;
- return PDF_TOK_STRING;
- }
- static int
- pdf_token_from_keyword(char *key)
- {
- switch (*key)
- {
- case 'R':
- if (!strcmp(key, "R")) return PDF_TOK_R;
- break;
- case 't':
- if (!strcmp(key, "true")) return PDF_TOK_TRUE;
- if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
- break;
- case 'f':
- if (!strcmp(key, "false")) return PDF_TOK_FALSE;
- break;
- case 'n':
- if (!strcmp(key, "null")) return PDF_TOK_NULL;
- break;
- case 'o':
- if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
- break;
- case 'e':
- if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
- if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
- break;
- case 's':
- if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
- if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
- break;
- case 'x':
- if (!strcmp(key, "xref")) return PDF_TOK_XREF;
- break;
- default:
- break;
- }
- return PDF_TOK_KEYWORD;
- }
- void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
- {
- lb->size = lb->base_size = size;
- lb->len = 0;
- lb->ctx = ctx;
- lb->scratch = &lb->buffer[0];
- }
- void pdf_lexbuf_fin(pdf_lexbuf *lb)
- {
- if (lb && lb->size != lb->base_size)
- fz_free(lb->ctx, lb->scratch);
- }
- ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb)
- {
- char *old = lb->scratch;
- int newsize = lb->size * 2;
- if (lb->size == lb->base_size)
- {
- lb->scratch = fz_malloc(lb->ctx, newsize);
- memcpy(lb->scratch, lb->buffer, lb->size);
- }
- else
- {
- lb->scratch = fz_resize_array(lb->ctx, lb->scratch, newsize, 1);
- }
- lb->size = newsize;
- return lb->scratch - old;
- }
- int
- pdf_lex(fz_stream *f, pdf_lexbuf *buf)
- {
- while (1)
- {
- int c = fz_read_byte(f);
- switch (c)
- {
- case EOF:
- return PDF_TOK_EOF;
- case IS_WHITE:
- lex_white(f);
- break;
- case '%':
- lex_comment(f);
- break;
- case '/':
- lex_name(f, buf);
- return PDF_TOK_NAME;
- case '(':
- return lex_string(f, buf);
- case ')':
- fz_warn(f->ctx, "lexical error (unexpected ')')");
- continue;
- case '<':
- c = fz_read_byte(f);
- if (c == '<')
- {
- return PDF_TOK_OPEN_DICT;
- }
- else
- {
- fz_unread_byte(f);
- return lex_hex_string(f, buf);
- }
- case '>':
- c = fz_read_byte(f);
- if (c == '>')
- {
- return PDF_TOK_CLOSE_DICT;
- }
- fz_warn(f->ctx, "lexical error (unexpected '>')");
- continue;
- case '[':
- return PDF_TOK_OPEN_ARRAY;
- case ']':
- return PDF_TOK_CLOSE_ARRAY;
- case '{':
- return PDF_TOK_OPEN_BRACE;
- case '}':
- return PDF_TOK_CLOSE_BRACE;
- case IS_NUMBER:
- return lex_number(f, buf, c);
- default: /* isregular: !isdelim && !iswhite && c != EOF */
- fz_unread_byte(f);
- lex_name(f, buf);
- return pdf_token_from_keyword(buf->scratch);
- }
- }
- }
- void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
- {
- switch (tok)
- {
- case PDF_TOK_NAME:
- fz_buffer_printf(ctx, fzbuf, "/%s", buf->scratch);
- break;
- case PDF_TOK_STRING:
- if (buf->len >= buf->size)
- pdf_lexbuf_grow(buf);
- buf->scratch[buf->len] = 0;
- fz_buffer_cat_pdf_string(ctx, fzbuf, buf->scratch);
- break;
- case PDF_TOK_OPEN_DICT:
- fz_buffer_printf(ctx, fzbuf, "<<");
- break;
- case PDF_TOK_CLOSE_DICT:
- fz_buffer_printf(ctx, fzbuf, ">>");
- break;
- case PDF_TOK_OPEN_ARRAY:
- fz_buffer_printf(ctx, fzbuf, "[");
- break;
- case PDF_TOK_CLOSE_ARRAY:
- fz_buffer_printf(ctx, fzbuf, "]");
- break;
- case PDF_TOK_OPEN_BRACE:
- fz_buffer_printf(ctx, fzbuf, "{");
- break;
- case PDF_TOK_CLOSE_BRACE:
- fz_buffer_printf(ctx, fzbuf, "}");
- break;
- case PDF_TOK_INT:
- fz_buffer_printf(ctx, fzbuf, "%d", buf->i);
- break;
- case PDF_TOK_REAL:
- {
- char sbuf[256];
- sprintf(sbuf, "%g", buf->f);
- if (strchr(sbuf, 'e')) /* bad news! */
- sprintf(sbuf, fabsf(buf->f) > 1 ? "%1.1f" : "%1.8f", buf->f);
- fz_buffer_printf(ctx, fzbuf, "%s", sbuf);
- }
- break;
- default:
- fz_buffer_printf(ctx, fzbuf, "%s", buf->scratch);
- break;
- }
- }