/pdf/pdf_parse.c

https://github.com/mescher/mupdf · C · 601 lines · 527 code · 66 blank · 8 comment · 129 complexity · 02e935a501d12da22654c2797ace55bc MD5 · raw file

  1. #include "fitz.h"
  2. #include "mupdf.h"
  3. fz_rect
  4. pdf_to_rect(fz_obj *array)
  5. {
  6. fz_rect r;
  7. float a = fz_to_real(fz_array_get(array, 0));
  8. float b = fz_to_real(fz_array_get(array, 1));
  9. float c = fz_to_real(fz_array_get(array, 2));
  10. float d = fz_to_real(fz_array_get(array, 3));
  11. r.x0 = MIN(a, c);
  12. r.y0 = MIN(b, d);
  13. r.x1 = MAX(a, c);
  14. r.y1 = MAX(b, d);
  15. return r;
  16. }
  17. fz_matrix
  18. pdf_to_matrix(fz_obj *array)
  19. {
  20. fz_matrix m;
  21. m.a = fz_to_real(fz_array_get(array, 0));
  22. m.b = fz_to_real(fz_array_get(array, 1));
  23. m.c = fz_to_real(fz_array_get(array, 2));
  24. m.d = fz_to_real(fz_array_get(array, 3));
  25. m.e = fz_to_real(fz_array_get(array, 4));
  26. m.f = fz_to_real(fz_array_get(array, 5));
  27. return m;
  28. }
  29. /* Convert Unicode/PdfDocEncoding string into utf-8 */
  30. char *
  31. pdf_to_utf8(fz_obj *src)
  32. {
  33. unsigned char *srcptr = (unsigned char *) fz_to_str_buf(src);
  34. char *dstptr, *dst;
  35. int srclen = fz_to_str_len(src);
  36. int dstlen = 0;
  37. int ucs;
  38. int i;
  39. if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
  40. {
  41. for (i = 2; i < srclen; i += 2)
  42. {
  43. ucs = srcptr[i] << 8 | srcptr[i+1];
  44. dstlen += runelen(ucs);
  45. }
  46. dstptr = dst = fz_malloc(dstlen + 1);
  47. for (i = 2; i < srclen; i += 2)
  48. {
  49. ucs = srcptr[i] << 8 | srcptr[i+1];
  50. dstptr += runetochar(dstptr, &ucs);
  51. }
  52. }
  53. else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
  54. {
  55. for (i = 2; i + 1 < srclen; i += 2)
  56. {
  57. ucs = srcptr[i] | srcptr[i+1] << 8;
  58. dstlen += runelen(ucs);
  59. }
  60. dstptr = dst = fz_malloc(dstlen + 1);
  61. for (i = 2; i + 1 < srclen; i += 2)
  62. {
  63. ucs = srcptr[i] | srcptr[i+1] << 8;
  64. dstptr += runetochar(dstptr, &ucs);
  65. }
  66. }
  67. else
  68. {
  69. for (i = 0; i < srclen; i++)
  70. dstlen += runelen(pdf_doc_encoding[srcptr[i]]);
  71. dstptr = dst = fz_malloc(dstlen + 1);
  72. for (i = 0; i < srclen; i++)
  73. {
  74. ucs = pdf_doc_encoding[srcptr[i]];
  75. dstptr += runetochar(dstptr, &ucs);
  76. }
  77. }
  78. *dstptr = '\0';
  79. return dst;
  80. }
  81. /* Convert Unicode/PdfDocEncoding string into ucs-2 */
  82. unsigned short *
  83. pdf_to_ucs2(fz_obj *src)
  84. {
  85. unsigned char *srcptr = (unsigned char *) fz_to_str_buf(src);
  86. unsigned short *dstptr, *dst;
  87. int srclen = fz_to_str_len(src);
  88. int i;
  89. if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
  90. {
  91. dstptr = dst = fz_calloc((srclen - 2) / 2 + 1, sizeof(short));
  92. for (i = 2; i + 1 < srclen; i += 2)
  93. *dstptr++ = srcptr[i] << 8 | srcptr[i+1];
  94. }
  95. else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
  96. {
  97. dstptr = dst = fz_calloc((srclen - 2) / 2 + 1, sizeof(short));
  98. for (i = 2; i + 1 < srclen; i += 2)
  99. *dstptr++ = srcptr[i] | srcptr[i+1] << 8;
  100. }
  101. else
  102. {
  103. dstptr = dst = fz_calloc(srclen + 1, sizeof(short));
  104. for (i = 0; i < srclen; i++)
  105. *dstptr++ = pdf_doc_encoding[srcptr[i]];
  106. }
  107. *dstptr = '\0';
  108. return dst;
  109. }
  110. /* Convert UCS-2 string into PdfDocEncoding for authentication */
  111. char *
  112. pdf_from_ucs2(unsigned short *src)
  113. {
  114. int i, j, len;
  115. char *docstr;
  116. len = 0;
  117. while (src[len])
  118. len++;
  119. docstr = fz_malloc(len + 1);
  120. for (i = 0; i < len; i++)
  121. {
  122. /* shortcut: check if the character has the same code point in both encodings */
  123. if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i]) {
  124. docstr[i] = src[i];
  125. continue;
  126. }
  127. /* search through pdf_docencoding for the character's code point */
  128. for (j = 0; j < 256; j++)
  129. if (pdf_doc_encoding[j] == src[i])
  130. break;
  131. docstr[i] = j;
  132. /* fail, if a character can't be encoded */
  133. if (!docstr[i])
  134. {
  135. fz_free(docstr);
  136. return NULL;
  137. }
  138. }
  139. docstr[len] = '\0';
  140. return docstr;
  141. }
  142. fz_obj *
  143. pdf_to_utf8_name(fz_obj *src)
  144. {
  145. char *buf = pdf_to_utf8(src);
  146. fz_obj *dst = fz_new_name(buf);
  147. fz_free(buf);
  148. return dst;
  149. }
  150. fz_error
  151. pdf_parse_array(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
  152. {
  153. fz_error error = fz_okay;
  154. fz_obj *ary = NULL;
  155. fz_obj *obj = NULL;
  156. int a = 0, b = 0, n = 0;
  157. int tok;
  158. int len;
  159. ary = fz_new_array(4);
  160. while (1)
  161. {
  162. error = pdf_lex(&tok, file, buf, cap, &len);
  163. if (error)
  164. {
  165. fz_drop_obj(ary);
  166. return fz_rethrow(error, "cannot parse array");
  167. }
  168. if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
  169. {
  170. if (n > 0)
  171. {
  172. obj = fz_new_int(a);
  173. fz_array_push(ary, obj);
  174. fz_drop_obj(obj);
  175. }
  176. if (n > 1)
  177. {
  178. obj = fz_new_int(b);
  179. fz_array_push(ary, obj);
  180. fz_drop_obj(obj);
  181. }
  182. n = 0;
  183. }
  184. if (tok == PDF_TOK_INT && n == 2)
  185. {
  186. obj = fz_new_int(a);
  187. fz_array_push(ary, obj);
  188. fz_drop_obj(obj);
  189. a = b;
  190. n --;
  191. }
  192. switch (tok)
  193. {
  194. case PDF_TOK_CLOSE_ARRAY:
  195. *op = ary;
  196. return fz_okay;
  197. case PDF_TOK_INT:
  198. if (n == 0)
  199. a = atoi(buf);
  200. if (n == 1)
  201. b = atoi(buf);
  202. n ++;
  203. break;
  204. case PDF_TOK_R:
  205. if (n != 2)
  206. {
  207. fz_drop_obj(ary);
  208. return fz_throw("cannot parse indirect reference in array");
  209. }
  210. obj = fz_new_indirect(a, b, xref);
  211. fz_array_push(ary, obj);
  212. fz_drop_obj(obj);
  213. n = 0;
  214. break;
  215. case PDF_TOK_OPEN_ARRAY:
  216. error = pdf_parse_array(&obj, xref, file, buf, cap);
  217. if (error)
  218. {
  219. fz_drop_obj(ary);
  220. return fz_rethrow(error, "cannot parse array");
  221. }
  222. fz_array_push(ary, obj);
  223. fz_drop_obj(obj);
  224. break;
  225. case PDF_TOK_OPEN_DICT:
  226. error = pdf_parse_dict(&obj, xref, file, buf, cap);
  227. if (error)
  228. {
  229. fz_drop_obj(ary);
  230. return fz_rethrow(error, "cannot parse array");
  231. }
  232. fz_array_push(ary, obj);
  233. fz_drop_obj(obj);
  234. break;
  235. case PDF_TOK_NAME:
  236. obj = fz_new_name(buf);
  237. fz_array_push(ary, obj);
  238. fz_drop_obj(obj);
  239. break;
  240. case PDF_TOK_REAL:
  241. obj = fz_new_real(fz_atof(buf));
  242. fz_array_push(ary, obj);
  243. fz_drop_obj(obj);
  244. break;
  245. case PDF_TOK_STRING:
  246. obj = fz_new_string(buf, len);
  247. fz_array_push(ary, obj);
  248. fz_drop_obj(obj);
  249. break;
  250. case PDF_TOK_TRUE:
  251. obj = fz_new_bool(1);
  252. fz_array_push(ary, obj);
  253. fz_drop_obj(obj);
  254. break;
  255. case PDF_TOK_FALSE:
  256. obj = fz_new_bool(0);
  257. fz_array_push(ary, obj);
  258. fz_drop_obj(obj);
  259. break;
  260. case PDF_TOK_NULL:
  261. obj = fz_new_null();
  262. fz_array_push(ary, obj);
  263. fz_drop_obj(obj);
  264. break;
  265. default:
  266. fz_drop_obj(ary);
  267. return fz_throw("cannot parse token in array");
  268. }
  269. }
  270. }
  271. fz_error
  272. pdf_parse_dict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
  273. {
  274. fz_error error = fz_okay;
  275. fz_obj *dict = NULL;
  276. fz_obj *key = NULL;
  277. fz_obj *val = NULL;
  278. int tok;
  279. int len;
  280. int a, b;
  281. dict = fz_new_dict(8);
  282. while (1)
  283. {
  284. error = pdf_lex(&tok, file, buf, cap, &len);
  285. if (error)
  286. {
  287. fz_drop_obj(dict);
  288. return fz_rethrow(error, "cannot parse dict");
  289. }
  290. skip:
  291. if (tok == PDF_TOK_CLOSE_DICT)
  292. {
  293. *op = dict;
  294. return fz_okay;
  295. }
  296. /* for BI .. ID .. EI in content streams */
  297. if (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID"))
  298. {
  299. *op = dict;
  300. return fz_okay;
  301. }
  302. if (tok != PDF_TOK_NAME)
  303. {
  304. fz_drop_obj(dict);
  305. return fz_throw("invalid key in dict");
  306. }
  307. key = fz_new_name(buf);
  308. error = pdf_lex(&tok, file, buf, cap, &len);
  309. if (error)
  310. {
  311. fz_drop_obj(key);
  312. fz_drop_obj(dict);
  313. return fz_rethrow(error, "cannot parse dict");
  314. }
  315. switch (tok)
  316. {
  317. case PDF_TOK_OPEN_ARRAY:
  318. error = pdf_parse_array(&val, xref, file, buf, cap);
  319. if (error)
  320. {
  321. fz_drop_obj(key);
  322. fz_drop_obj(dict);
  323. return fz_rethrow(error, "cannot parse dict");
  324. }
  325. break;
  326. case PDF_TOK_OPEN_DICT:
  327. error = pdf_parse_dict(&val, xref, file, buf, cap);
  328. if (error)
  329. {
  330. fz_drop_obj(key);
  331. fz_drop_obj(dict);
  332. return fz_rethrow(error, "cannot parse dict");
  333. }
  334. break;
  335. case PDF_TOK_NAME: val = fz_new_name(buf); break;
  336. case PDF_TOK_REAL: val = fz_new_real(fz_atof(buf)); break;
  337. case PDF_TOK_STRING: val = fz_new_string(buf, len); break;
  338. case PDF_TOK_TRUE: val = fz_new_bool(1); break;
  339. case PDF_TOK_FALSE: val = fz_new_bool(0); break;
  340. case PDF_TOK_NULL: val = fz_new_null(); break;
  341. case PDF_TOK_INT:
  342. /* 64-bit to allow for numbers > INT_MAX and overflow */
  343. a = (int) strtoll(buf, 0, 10);
  344. error = pdf_lex(&tok, file, buf, cap, &len);
  345. if (error)
  346. {
  347. fz_drop_obj(key);
  348. fz_drop_obj(dict);
  349. return fz_rethrow(error, "cannot parse dict");
  350. }
  351. if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
  352. (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID")))
  353. {
  354. val = fz_new_int(a);
  355. fz_dict_put(dict, key, val);
  356. fz_drop_obj(val);
  357. fz_drop_obj(key);
  358. goto skip;
  359. }
  360. if (tok == PDF_TOK_INT)
  361. {
  362. b = atoi(buf);
  363. error = pdf_lex(&tok, file, buf, cap, &len);
  364. if (error)
  365. {
  366. fz_drop_obj(key);
  367. fz_drop_obj(dict);
  368. return fz_rethrow(error, "cannot parse dict");
  369. }
  370. if (tok == PDF_TOK_R)
  371. {
  372. val = fz_new_indirect(a, b, xref);
  373. break;
  374. }
  375. }
  376. fz_drop_obj(key);
  377. fz_drop_obj(dict);
  378. return fz_throw("invalid indirect reference in dict");
  379. default:
  380. fz_drop_obj(key);
  381. fz_drop_obj(dict);
  382. return fz_throw("unknown token in dict");
  383. }
  384. fz_dict_put(dict, key, val);
  385. fz_drop_obj(val);
  386. fz_drop_obj(key);
  387. }
  388. }
  389. fz_error
  390. pdf_parse_stm_obj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
  391. {
  392. fz_error error;
  393. int tok;
  394. int len;
  395. error = pdf_lex(&tok, file, buf, cap, &len);
  396. if (error)
  397. return fz_rethrow(error, "cannot parse token in object stream");
  398. switch (tok)
  399. {
  400. case PDF_TOK_OPEN_ARRAY:
  401. error = pdf_parse_array(op, xref, file, buf, cap);
  402. if (error)
  403. return fz_rethrow(error, "cannot parse object stream");
  404. break;
  405. case PDF_TOK_OPEN_DICT:
  406. error = pdf_parse_dict(op, xref, file, buf, cap);
  407. if (error)
  408. return fz_rethrow(error, "cannot parse object stream");
  409. break;
  410. case PDF_TOK_NAME: *op = fz_new_name(buf); break;
  411. case PDF_TOK_REAL: *op = fz_new_real(fz_atof(buf)); break;
  412. case PDF_TOK_STRING: *op = fz_new_string(buf, len); break;
  413. case PDF_TOK_TRUE: *op = fz_new_bool(1); break;
  414. case PDF_TOK_FALSE: *op = fz_new_bool(0); break;
  415. case PDF_TOK_NULL: *op = fz_new_null(); break;
  416. case PDF_TOK_INT: *op = fz_new_int(atoi(buf)); break;
  417. default: return fz_throw("unknown token in object stream");
  418. }
  419. return fz_okay;
  420. }
  421. fz_error
  422. pdf_parse_ind_obj(fz_obj **op, pdf_xref *xref,
  423. fz_stream *file, char *buf, int cap,
  424. int *onum, int *ogen, int *ostmofs)
  425. {
  426. fz_error error = fz_okay;
  427. fz_obj *obj = NULL;
  428. int num = 0, gen = 0, stm_ofs;
  429. int tok;
  430. int len;
  431. int a, b;
  432. error = pdf_lex(&tok, file, buf, cap, &len);
  433. if (error)
  434. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  435. if (tok != PDF_TOK_INT)
  436. return fz_throw("expected object number (%d %d R)", num, gen);
  437. num = atoi(buf);
  438. error = pdf_lex(&tok, file, buf, cap, &len);
  439. if (error)
  440. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  441. if (tok != PDF_TOK_INT)
  442. return fz_throw("expected generation number (%d %d R)", num, gen);
  443. gen = atoi(buf);
  444. error = pdf_lex(&tok, file, buf, cap, &len);
  445. if (error)
  446. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  447. if (tok != PDF_TOK_OBJ)
  448. return fz_throw("expected 'obj' keyword (%d %d R)", num, gen);
  449. error = pdf_lex(&tok, file, buf, cap, &len);
  450. if (error)
  451. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  452. switch (tok)
  453. {
  454. case PDF_TOK_OPEN_ARRAY:
  455. error = pdf_parse_array(&obj, xref, file, buf, cap);
  456. if (error)
  457. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  458. break;
  459. case PDF_TOK_OPEN_DICT:
  460. error = pdf_parse_dict(&obj, xref, file, buf, cap);
  461. if (error)
  462. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  463. break;
  464. case PDF_TOK_NAME: obj = fz_new_name(buf); break;
  465. case PDF_TOK_REAL: obj = fz_new_real(fz_atof(buf)); break;
  466. case PDF_TOK_STRING: obj = fz_new_string(buf, len); break;
  467. case PDF_TOK_TRUE: obj = fz_new_bool(1); break;
  468. case PDF_TOK_FALSE: obj = fz_new_bool(0); break;
  469. case PDF_TOK_NULL: obj = fz_new_null(); break;
  470. case PDF_TOK_INT:
  471. a = atoi(buf);
  472. error = pdf_lex(&tok, file, buf, cap, &len);
  473. if (error)
  474. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  475. if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
  476. {
  477. obj = fz_new_int(a);
  478. goto skip;
  479. }
  480. if (tok == PDF_TOK_INT)
  481. {
  482. b = atoi(buf);
  483. error = pdf_lex(&tok, file, buf, cap, &len);
  484. if (error)
  485. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  486. if (tok == PDF_TOK_R)
  487. {
  488. obj = fz_new_indirect(a, b, xref);
  489. break;
  490. }
  491. }
  492. return fz_throw("expected 'R' keyword (%d %d R)", num, gen);
  493. case PDF_TOK_ENDOBJ:
  494. obj = fz_new_null();
  495. goto skip;
  496. default:
  497. return fz_throw("syntax error in object (%d %d R)", num, gen);
  498. }
  499. error = pdf_lex(&tok, file, buf, cap, &len);
  500. if (error)
  501. {
  502. fz_drop_obj(obj);
  503. return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
  504. }
  505. skip:
  506. if (tok == PDF_TOK_STREAM)
  507. {
  508. int c = fz_read_byte(file);
  509. while (c == ' ')
  510. c = fz_read_byte(file);
  511. if (c == '\r')
  512. {
  513. c = fz_peek_byte(file);
  514. if (c != '\n')
  515. fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen);
  516. else
  517. fz_read_byte(file);
  518. }
  519. stm_ofs = fz_tell(file);
  520. }
  521. else if (tok == PDF_TOK_ENDOBJ)
  522. {
  523. stm_ofs = 0;
  524. }
  525. else
  526. {
  527. fz_warn("expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
  528. stm_ofs = 0;
  529. }
  530. if (onum) *onum = num;
  531. if (ogen) *ogen = gen;
  532. if (ostmofs) *ostmofs = stm_ofs;
  533. *op = obj;
  534. return fz_okay;
  535. }