PageRenderTime 70ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/pdf/pdf_lex.c

https://bitbucket.org/sumidasignage/mupdf
C | 553 lines | 520 code | 24 blank | 9 comment | 89 complexity | 8cdefd7997c32b878200c7482790ca1d MD5 | raw file
  1. #include "fitz-internal.h"
  2. #include "mupdf-internal.h"
/*
 * Case-label helper macros.
 *
 * Each macro expands to a run of case labels with the leading `case`
 * keyword and the trailing `:` left off, so that it can be written as
 * `case IS_WHITE:` inside a switch statement and match every listed
 * character.
 */

/* Characters that can start a number: sign, decimal point or digit. */
#define IS_NUMBER \
'+':case'-':case'.':case'0':case'1':case'2':case'3':\
case'4':case'5':case'6':case'7':case'8':case'9'
/* Whitespace bytes: NUL, TAB, LF, FF, CR and SPACE (same set as iswhite()). */
#define IS_WHITE \
'\000':case'\011':case'\012':case'\014':case'\015':case'\040'
/* Hexadecimal digits in either letter case. */
#define IS_HEX \
'0':case'1':case'2':case'3':case'4':case'5':case'6':\
case'7':case'8':case'9':case'A':case'B':case'C':\
case'D':case'E':case'F':case'a':case'b':case'c':\
case'd':case'e':case'f'
/* Delimiter characters; lex_name() stops at any of these. */
#define IS_DELIM \
'(':case')':case'<':case'>':case'[':case']':case'{':\
case'}':case'/':case'%'
/* Decimal digits. */
#define RANGE_0_9 \
'0':case'1':case'2':case'3':case'4':case'5':\
case'6':case'7':case'8':case'9'
/* Lower-case hexadecimal letters. */
#define RANGE_a_f \
'a':case'b':case'c':case'd':case'e':case'f'
/* Upper-case hexadecimal letters. */
#define RANGE_A_F \
'A':case'B':case'C':case'D':case'E':case'F'
  23. static inline int iswhite(int ch)
  24. {
  25. return
  26. ch == '\000' ||
  27. ch == '\011' ||
  28. ch == '\012' ||
  29. ch == '\014' ||
  30. ch == '\015' ||
  31. ch == '\040';
  32. }
  33. static inline int unhex(int ch)
  34. {
  35. if (ch >= '0' && ch <= '9') return ch - '0';
  36. if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA;
  37. if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA;
  38. return 0;
  39. }
  40. static void
  41. lex_white(fz_stream *f)
  42. {
  43. int c;
  44. do {
  45. c = fz_read_byte(f);
  46. } while ((c <= 32) && (iswhite(c)));
  47. if (c != EOF)
  48. fz_unread_byte(f);
  49. }
  50. static void
  51. lex_comment(fz_stream *f)
  52. {
  53. int c;
  54. do {
  55. c = fz_read_byte(f);
  56. } while ((c != '\012') && (c != '\015') && (c != EOF));
  57. }
  58. static int
  59. lex_number(fz_stream *f, pdf_lexbuf *buf, int c)
  60. {
  61. int neg = 0;
  62. int i = 0;
  63. int n;
  64. int d;
  65. float v;
  66. /* Initially we might have +, -, . or a digit */
  67. switch (c)
  68. {
  69. case '.':
  70. goto loop_after_dot;
  71. case '-':
  72. neg = 1;
  73. break;
  74. case '+':
  75. break;
  76. default: /* Must be a digit */
  77. i = c - '0';
  78. break;
  79. }
  80. while (1)
  81. {
  82. c = fz_read_byte(f);
  83. switch (c)
  84. {
  85. case '.':
  86. goto loop_after_dot;
  87. case RANGE_0_9:
  88. i = 10*i + c - '0';
  89. /* FIXME: Need overflow check here; do we care? */
  90. break;
  91. default:
  92. fz_unread_byte(f);
  93. /* Fallthrough */
  94. case EOF:
  95. if (neg)
  96. i = -i;
  97. buf->i = i;
  98. return PDF_TOK_INT;
  99. }
  100. }
  101. /* In here, we've seen a dot, so can accept just digits */
  102. loop_after_dot:
  103. n = 0;
  104. d = 1;
  105. while (1)
  106. {
  107. c = fz_read_byte(f);
  108. switch (c)
  109. {
  110. case RANGE_0_9:
  111. if (d >= INT_MAX/10)
  112. goto underflow;
  113. n = n*10 + (c - '0');
  114. d *= 10;
  115. break;
  116. default:
  117. fz_unread_byte(f);
  118. /* Fallthrough */
  119. case EOF:
  120. v = (float)i + ((float)n / (float)d);
  121. if (neg)
  122. v = -v;
  123. buf->f = v;
  124. return PDF_TOK_REAL;
  125. }
  126. }
  127. underflow:
  128. /* Ignore any digits after here, because they are too small */
  129. while (1)
  130. {
  131. c = fz_read_byte(f);
  132. switch (c)
  133. {
  134. case RANGE_0_9:
  135. break;
  136. default:
  137. fz_unread_byte(f);
  138. /* Fallthrough */
  139. case EOF:
  140. v = (float)i + ((float)n / (float)d);
  141. if (neg)
  142. v = -v;
  143. buf->f = v;
  144. return PDF_TOK_REAL;
  145. }
  146. }
  147. }
  148. static void
  149. lex_name(fz_stream *f, pdf_lexbuf *buf)
  150. {
  151. char *s = buf->scratch;
  152. int n = buf->size;
  153. while (n > 1)
  154. {
  155. int c = fz_read_byte(f);
  156. switch (c)
  157. {
  158. case IS_WHITE:
  159. case IS_DELIM:
  160. fz_unread_byte(f);
  161. goto end;
  162. case EOF:
  163. goto end;
  164. case '#':
  165. {
  166. int d;
  167. c = fz_read_byte(f);
  168. switch (c)
  169. {
  170. case RANGE_0_9:
  171. d = (c - '0') << 4;
  172. break;
  173. case RANGE_a_f:
  174. d = (c - 'a' + 10) << 4;
  175. break;
  176. case RANGE_A_F:
  177. d = (c - 'A' + 10) << 4;
  178. break;
  179. default:
  180. fz_unread_byte(f);
  181. /* fallthrough */
  182. case EOF:
  183. goto end;
  184. }
  185. c = fz_read_byte(f);
  186. switch (c)
  187. {
  188. case RANGE_0_9:
  189. c -= '0';
  190. break;
  191. case RANGE_a_f:
  192. c -= 'a' - 10;
  193. break;
  194. case RANGE_A_F:
  195. c -= 'A' - 10;
  196. break;
  197. default:
  198. fz_unread_byte(f);
  199. /* fallthrough */
  200. case EOF:
  201. *s++ = d;
  202. n--;
  203. goto end;
  204. }
  205. *s++ = d + c;
  206. n--;
  207. break;
  208. }
  209. default:
  210. *s++ = c;
  211. n--;
  212. break;
  213. }
  214. }
  215. end:
  216. *s = '\0';
  217. buf->len = s - buf->scratch;
  218. }
  219. static int
  220. lex_string(fz_stream *f, pdf_lexbuf *lb)
  221. {
  222. char *s = lb->scratch;
  223. char *e = s + lb->size;
  224. int bal = 1;
  225. int oct;
  226. int c;
  227. while (1)
  228. {
  229. if (s == e)
  230. {
  231. s += pdf_lexbuf_grow(lb);
  232. e = lb->scratch + lb->size;
  233. }
  234. c = fz_read_byte(f);
  235. switch (c)
  236. {
  237. case EOF:
  238. goto end;
  239. case '(':
  240. bal++;
  241. *s++ = c;
  242. break;
  243. case ')':
  244. bal --;
  245. if (bal == 0)
  246. goto end;
  247. *s++ = c;
  248. break;
  249. case '\\':
  250. c = fz_read_byte(f);
  251. switch (c)
  252. {
  253. case EOF:
  254. goto end;
  255. case 'n':
  256. *s++ = '\n';
  257. break;
  258. case 'r':
  259. *s++ = '\r';
  260. break;
  261. case 't':
  262. *s++ = '\t';
  263. break;
  264. case 'b':
  265. *s++ = '\b';
  266. break;
  267. case 'f':
  268. *s++ = '\f';
  269. break;
  270. case '(':
  271. *s++ = '(';
  272. break;
  273. case ')':
  274. *s++ = ')';
  275. break;
  276. case '\\':
  277. *s++ = '\\';
  278. break;
  279. case RANGE_0_9:
  280. oct = c - '0';
  281. c = fz_read_byte(f);
  282. if (c >= '0' && c <= '9')
  283. {
  284. oct = oct * 8 + (c - '0');
  285. c = fz_read_byte(f);
  286. if (c >= '0' && c <= '9')
  287. oct = oct * 8 + (c - '0');
  288. else if (c != EOF)
  289. fz_unread_byte(f);
  290. }
  291. else if (c != EOF)
  292. fz_unread_byte(f);
  293. *s++ = oct;
  294. break;
  295. case '\n':
  296. break;
  297. case '\r':
  298. c = fz_read_byte(f);
  299. if ((c != '\n') && (c != EOF))
  300. fz_unread_byte(f);
  301. break;
  302. default:
  303. *s++ = c;
  304. }
  305. break;
  306. default:
  307. *s++ = c;
  308. break;
  309. }
  310. }
  311. end:
  312. lb->len = s - lb->scratch;
  313. return PDF_TOK_STRING;
  314. }
  315. static int
  316. lex_hex_string(fz_stream *f, pdf_lexbuf *lb)
  317. {
  318. char *s = lb->scratch;
  319. char *e = s + lb->size;
  320. int a = 0, x = 0;
  321. int c;
  322. while (1)
  323. {
  324. if (s == e)
  325. {
  326. s += pdf_lexbuf_grow(lb);
  327. e = lb->scratch + lb->size;
  328. }
  329. c = fz_read_byte(f);
  330. switch (c)
  331. {
  332. case IS_WHITE:
  333. break;
  334. case IS_HEX:
  335. if (x)
  336. {
  337. *s++ = a * 16 + unhex(c);
  338. x = !x;
  339. }
  340. else
  341. {
  342. a = unhex(c);
  343. x = !x;
  344. }
  345. break;
  346. case '>':
  347. case EOF:
  348. goto end;
  349. default:
  350. fz_warn(f->ctx, "ignoring invalid character in hex string: '%c'", c);
  351. }
  352. }
  353. end:
  354. lb->len = s - lb->scratch;
  355. return PDF_TOK_STRING;
  356. }
  357. static int
  358. pdf_token_from_keyword(char *key)
  359. {
  360. switch (*key)
  361. {
  362. case 'R':
  363. if (!strcmp(key, "R")) return PDF_TOK_R;
  364. break;
  365. case 't':
  366. if (!strcmp(key, "true")) return PDF_TOK_TRUE;
  367. if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
  368. break;
  369. case 'f':
  370. if (!strcmp(key, "false")) return PDF_TOK_FALSE;
  371. break;
  372. case 'n':
  373. if (!strcmp(key, "null")) return PDF_TOK_NULL;
  374. break;
  375. case 'o':
  376. if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
  377. break;
  378. case 'e':
  379. if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
  380. if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
  381. break;
  382. case 's':
  383. if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
  384. if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
  385. break;
  386. case 'x':
  387. if (!strcmp(key, "xref")) return PDF_TOK_XREF;
  388. break;
  389. default:
  390. break;
  391. }
  392. return PDF_TOK_KEYWORD;
  393. }
  394. void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
  395. {
  396. lb->size = lb->base_size = size;
  397. lb->len = 0;
  398. lb->ctx = ctx;
  399. lb->scratch = &lb->buffer[0];
  400. }
  401. void pdf_lexbuf_fin(pdf_lexbuf *lb)
  402. {
  403. if (lb && lb->size != lb->base_size)
  404. fz_free(lb->ctx, lb->scratch);
  405. }
  406. ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb)
  407. {
  408. char *old = lb->scratch;
  409. int newsize = lb->size * 2;
  410. if (lb->size == lb->base_size)
  411. {
  412. lb->scratch = fz_malloc(lb->ctx, newsize);
  413. memcpy(lb->scratch, lb->buffer, lb->size);
  414. }
  415. else
  416. {
  417. lb->scratch = fz_resize_array(lb->ctx, lb->scratch, newsize, 1);
  418. }
  419. lb->size = newsize;
  420. return lb->scratch - old;
  421. }
  422. int
  423. pdf_lex(fz_stream *f, pdf_lexbuf *buf)
  424. {
  425. while (1)
  426. {
  427. int c = fz_read_byte(f);
  428. switch (c)
  429. {
  430. case EOF:
  431. return PDF_TOK_EOF;
  432. case IS_WHITE:
  433. lex_white(f);
  434. break;
  435. case '%':
  436. lex_comment(f);
  437. break;
  438. case '/':
  439. lex_name(f, buf);
  440. return PDF_TOK_NAME;
  441. case '(':
  442. return lex_string(f, buf);
  443. case ')':
  444. fz_warn(f->ctx, "lexical error (unexpected ')')");
  445. continue;
  446. case '<':
  447. c = fz_read_byte(f);
  448. if (c == '<')
  449. {
  450. return PDF_TOK_OPEN_DICT;
  451. }
  452. else
  453. {
  454. fz_unread_byte(f);
  455. return lex_hex_string(f, buf);
  456. }
  457. case '>':
  458. c = fz_read_byte(f);
  459. if (c == '>')
  460. {
  461. return PDF_TOK_CLOSE_DICT;
  462. }
  463. fz_warn(f->ctx, "lexical error (unexpected '>')");
  464. continue;
  465. case '[':
  466. return PDF_TOK_OPEN_ARRAY;
  467. case ']':
  468. return PDF_TOK_CLOSE_ARRAY;
  469. case '{':
  470. return PDF_TOK_OPEN_BRACE;
  471. case '}':
  472. return PDF_TOK_CLOSE_BRACE;
  473. case IS_NUMBER:
  474. return lex_number(f, buf, c);
  475. default: /* isregular: !isdelim && !iswhite && c != EOF */
  476. fz_unread_byte(f);
  477. lex_name(f, buf);
  478. return pdf_token_from_keyword(buf->scratch);
  479. }
  480. }
  481. }
  482. void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
  483. {
  484. switch (tok)
  485. {
  486. case PDF_TOK_NAME:
  487. fz_buffer_printf(ctx, fzbuf, "/%s", buf->scratch);
  488. break;
  489. case PDF_TOK_STRING:
  490. if (buf->len >= buf->size)
  491. pdf_lexbuf_grow(buf);
  492. buf->scratch[buf->len] = 0;
  493. fz_buffer_cat_pdf_string(ctx, fzbuf, buf->scratch);
  494. break;
  495. case PDF_TOK_OPEN_DICT:
  496. fz_buffer_printf(ctx, fzbuf, "<<");
  497. break;
  498. case PDF_TOK_CLOSE_DICT:
  499. fz_buffer_printf(ctx, fzbuf, ">>");
  500. break;
  501. case PDF_TOK_OPEN_ARRAY:
  502. fz_buffer_printf(ctx, fzbuf, "[");
  503. break;
  504. case PDF_TOK_CLOSE_ARRAY:
  505. fz_buffer_printf(ctx, fzbuf, "]");
  506. break;
  507. case PDF_TOK_OPEN_BRACE:
  508. fz_buffer_printf(ctx, fzbuf, "{");
  509. break;
  510. case PDF_TOK_CLOSE_BRACE:
  511. fz_buffer_printf(ctx, fzbuf, "}");
  512. break;
  513. case PDF_TOK_INT:
  514. fz_buffer_printf(ctx, fzbuf, "%d", buf->i);
  515. break;
  516. case PDF_TOK_REAL:
  517. {
  518. char sbuf[256];
  519. sprintf(sbuf, "%g", buf->f);
  520. if (strchr(sbuf, 'e')) /* bad news! */
  521. sprintf(sbuf, fabsf(buf->f) > 1 ? "%1.1f" : "%1.8f", buf->f);
  522. fz_buffer_printf(ctx, fzbuf, "%s", sbuf);
  523. }
  524. break;
  525. default:
  526. fz_buffer_printf(ctx, fzbuf, "%s", buf->scratch);
  527. break;
  528. }
  529. }