PageRenderTime 61ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/epstool-3.08/src/cpdfscan.c

#
C | 1885 lines | 1581 code | 154 blank | 150 comment | 502 complexity | 0f28b1229799617f443909ec4b52fe19 MD5 | raw file
Possible License(s): GPL-2.0
  1. /* Copyright (C) 2002-2005 Ghostgum Software Pty Ltd. All rights reserved.
  2. This software is provided AS-IS with no warranty, either express or
  3. implied.
  4. This software is distributed under licence and may not be copied,
  5. modified or distributed except as expressly authorised under the terms
  6. of the licence contained in the file LICENCE in this distribution.
  7. For more information about licensing, please refer to
  8. http://www.ghostgum.com.au/ or contact Ghostgum Software Pty Ltd,
  9. 218 Gallaghers Rd, Glen Waverley VIC 3150, AUSTRALIA,
  10. Fax +61 3 9886 6616.
  11. */
  12. /* $Id: cpdfscan.c,v 1.7 2005/06/10 09:39:24 ghostgum Exp $ */
  13. /* PDF scanner */
  14. /* This is a rudimentary PDF scanner, intended to get
  15. * the page count, and for each page the Rotate, MediaBox
  16. * and CropBox.
  17. */
  18. #ifdef DEMO_PDFSCAN
  19. # include <windows.h>
  20. # include <stdio.h>
  21. # include <stdarg.h>
  22. # include <string.h>
  23. # include <ctype.h>
  24. # ifdef _MSC_VER
  25. # define vsnprintf _vsnprintf
  26. # endif
  27. # define csfopen fopen
  28. # define cslen strlen
  29. #else
  30. # include "common.h"
  31. # include <ctype.h>
  32. #endif
  33. #include "cpdfscan.h"
  34. /* Limitations.
  35. *
  36. * We currently load the entire xref table. To minimise memory
  37. * we could instead keep a list of xref blocks, and do random
  38. * access within those.
  39. *
  40. * Memory management is very simple. We just keep a linked
  41. * list of allocated blocks for composite objects.
  42. * We empty the stack, and free all PDF objects and composite
  43. * objects before returning to the caller.
  44. * We don't bother doing garbage collection.
  45. */
  46. /* We keep a linked list of memory allocated for composite objects
  47. * such as name, string, array or dict.
  48. */
  49. typedef struct PDFMEM_s PDFMEM;
  50. struct PDFMEM_s {
  51. void *ptr;
  52. int len;
  53. PDFMEM *next;
  54. };
  55. /* The token scanner and object references understand the following types */
  56. typedef enum rtype_e {
  57. invalidtype=0,
  58. marktype=1,
  59. nulltype=2,
  60. booltype=3, /* uses boolval */
  61. integertype=4, /* uses intval */
  62. realtype=5, /* uses realval */
  63. nametype=6, /* uses nameval */
  64. stringtype=7, /* uses strval */
  65. arraytype=8, /* uses arrayval */
  66. dicttype=9, /* uses dictval */
  67. optype=10, /* uses opval */
  68. streamtype=11, /* uses streamval */
  69. objtype=12, /* uses objval */
  70. commenttype=13
  71. } rtype;
  72. const char *rtype_string[] = {
  73. "invalidtype", "marktype", "nulltype", "booltype", "integertype",
  74. "realtype", "nametype", "stringtype", "arraytype", "dicttype",
  75. "optype", "streamtype", "objtype", "commenttype"
  76. };
  77. /* A reference contains a simple object, or a pointer to
  78. * a composite object.
  79. */
  80. typedef struct ref_s ref;
  81. struct ref_s {
  82. rtype type;
  83. int rsize;
  84. union value_u {
  85. /* simple */
  86. void *voidval;
  87. BOOL boolval;
  88. int intval;
  89. float realval;
  90. /* composite */
  91. char *nameval;
  92. char *strval;
  93. ref *arrayval;
  94. ref *dictval;
  95. char *opval;
  96. /* simple */
  97. unsigned long streamval;
  98. int objval;
  99. } value;
  100. };
  101. /* Cross reference table entry */
  102. typedef struct PDFXREF_s {
  103. unsigned long offset;
  104. int generation;
  105. BOOL used;
  106. } PDFXREF;
  107. struct PDFSCAN_s {
  108. void *handle;
  109. int (*print_fn)(void *handle, const char *ptr, int len);
  110. TCHAR filename[1024];
  111. FILE *file;
  112. char *buf;
  113. int buflen; /* length of allocated buf */
  114. int len; /* #bytes currently in buf */
  115. int offset; /* file offset to start of buf */
  116. int begin; /* offset in buf to start of token */
  117. int end; /* offset in buf to end of token */
  118. rtype token_type; /* token type */
  119. BOOL instream; /* In a stream, looking for endstream */
  120. unsigned long xref_offset; /* offset to xref table */
  121. PDFXREF *xref;
  122. int xref_len;
  123. /* Object numbers obtained during pdf_scan_open() */
  124. int root; /* root object reference */
  125. int info; /* document info dicionary reference */
  126. int pages; /* Pages dictionary reference */
  127. int page_count; /* number of pages */
  128. /* Cached page media */
  129. int pagenum;
  130. int rotate;
  131. PDFBBOX mediabox;
  132. PDFBBOX cropbox;
  133. /* memory allocation */
  134. PDFMEM *memory_head;
  135. PDFMEM *memory_tail;
  136. /* operand stack */
  137. ref *ostack;
  138. int ostack_idx; /* index to top of ostack */
  139. int ostack_len; /* Initially 512 */
  140. int ostack_maxlen; /* maximum depth of ostack */
  141. /* objects in memory */
  142. /* This contains pairs of integer & reference */
  143. ref *objs;
  144. int objs_count; /* count of loaded objects */
  145. int objs_len; /* length of objs */
  146. int objs_maxlen; /* maximum number entries in objs */
  147. };
  148. typedef enum PDFSEEK_e {
  149. PDFSEEK_CUR,
  150. PDFSEEK_END,
  151. PDFSEEK_SET
  152. } PDFSEEK;
  153. /* Prototypes */
  154. static int pdf_scan_next_token(PDFSCAN *ps);
  155. static int pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev);
  156. static int pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset);
  157. static void clear_stack(PDFSCAN *ps);
  158. static void clear_objs(PDFSCAN *ps);
  159. static void pdf_scan_freeall(PDFSCAN *ps);
  160. static void pdf_scan_cleanup(PDFSCAN *ps);
  161. static int pdf_scan_open_file(PDFSCAN *ps);
  162. /*****************************************************************/
  163. /* text message output */
  164. static int
  165. pdf_scan_write(PDFSCAN *ps, const char *str, int len)
  166. {
  167. if (ps != NULL)
  168. fwrite(str, 1, len, stdout);
  169. else
  170. (*ps->print_fn)(ps->handle, str, len);
  171. return len;
  172. }
  173. static int
  174. pdf_scan_msgf(PDFSCAN *ps, const char *fmt, ...)
  175. {
  176. va_list args;
  177. int count;
  178. char buf[2048];
  179. va_start(args,fmt);
  180. count = vsnprintf(buf, sizeof(buf), fmt, args);
  181. pdf_scan_write(ps, buf, count);
  182. va_end(args);
  183. return count;
  184. }
  185. /*****************************************************************/
  186. /* memory allocation */
  187. static void
  188. pdf_scan_cleanup(PDFSCAN *ps)
  189. {
  190. if (ps->file)
  191. fclose(ps->file);
  192. ps->file = NULL;
  193. clear_stack(ps);
  194. clear_objs(ps);
  195. pdf_scan_freeall(ps);
  196. }
  197. static void *pdf_scan_alloc(PDFSCAN *ps, const void *ptr, int len)
  198. {
  199. void *data;
  200. PDFMEM *mem = (PDFMEM *)malloc(sizeof(PDFMEM));
  201. if (mem == NULL)
  202. return NULL;
  203. data = malloc(len);
  204. if (data == NULL) {
  205. free(mem);
  206. return NULL;
  207. }
  208. mem->ptr = data;
  209. mem->next = NULL;
  210. mem->len = len;
  211. memcpy(data, ptr, len);
  212. if (ps->memory_tail) {
  213. ps->memory_tail->next = mem;
  214. ps->memory_tail = mem;
  215. }
  216. else
  217. ps->memory_head = ps->memory_tail = mem;
  218. return data;
  219. }
  220. /* free all name/string/array/dict memory */
  221. static void
  222. pdf_scan_freeall(PDFSCAN *ps)
  223. {
  224. PDFMEM *memnext;
  225. PDFMEM *mem = ps->memory_head;
  226. while (mem) {
  227. memnext = mem->next;
  228. free(mem->ptr);
  229. free(mem);
  230. mem = memnext;
  231. }
  232. ps->memory_head = ps->memory_tail = NULL;
  233. }
  234. /*****************************************************************/
  235. /* Token checks */
  236. static BOOL is_optoken(PDFSCAN *ps, const char *str)
  237. {
  238. return (ps->token_type == optype) &&
  239. (ps->end-ps->begin == (int)strlen(str)) &&
  240. (memcmp(ps->buf+ps->begin, str, ps->end-ps->begin) == 0);
  241. }
  242. static int
  243. type_check(PDFSCAN *ps, rtype type)
  244. {
  245. if (ps->token_type == type)
  246. return 0;
  247. pdf_scan_msgf(ps, "Error at offset %ld. Expecting %s and found %s\n",
  248. ps->offset + ps->begin,
  249. rtype_string[(int)type],
  250. rtype_string[(int)ps->token_type]);
  251. pdf_scan_msgf(ps, "Token is \042");
  252. pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
  253. pdf_scan_msgf(ps, "\042\n");
  254. return -1;
  255. }
  256. static int
  257. op_check(PDFSCAN *ps, const char *str)
  258. {
  259. int code = type_check(ps, optype);
  260. if (code)
  261. return code;
  262. if (!is_optoken(ps, str)) {
  263. pdf_scan_msgf(ps,
  264. "Error at offset %ld. Expecting \042%s\042 and found \042",
  265. ps->offset + ps->begin, str);
  266. pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
  267. pdf_scan_msgf(ps, "\042\n");
  268. code = -1;
  269. }
  270. return code;
  271. }
  272. /*****************************************************************/
  273. /* stack */
  274. const ref invalidref = {invalidtype, 0, {NULL}};
  275. const ref markref = {marktype, 0, {NULL}};
  276. /* Push item, return depth of stack */
  277. /* >0 is success, <=0 is failure */
  278. static int push_stack(PDFSCAN *ps, ref r)
  279. {
  280. int idx;
  281. if (ps->ostack_idx + 1 >= ps->ostack_len) {
  282. /* increase stack size */
  283. ref *newstack;
  284. int newlen = ps->ostack_len + 256;
  285. if (newlen > ps->ostack_maxlen) {
  286. pdf_scan_msgf(ps, "push_stack: stack overflow\n");
  287. return 0;
  288. }
  289. newstack = (ref *)malloc(newlen * sizeof(ref));
  290. if (newstack == NULL) {
  291. pdf_scan_msgf(ps, "push_stack: Out of memory\n");
  292. return 0;
  293. }
  294. memcpy(newstack, ps->ostack, ps->ostack_len * sizeof(ref));
  295. free(ps->ostack);
  296. ps->ostack = newstack;
  297. ps->ostack_len = newlen;
  298. }
  299. idx = ++(ps->ostack_idx);
  300. ps->ostack[idx] = r;
  301. return idx;
  302. }
  303. static ref pop_stack(PDFSCAN *ps)
  304. {
  305. if (ps->ostack_idx <= 0) {
  306. pdf_scan_msgf(ps, "pop_stack: stack underflow\n");
  307. return invalidref;
  308. }
  309. return ps->ostack[ps->ostack_idx--];
  310. }
  311. static void clear_stack(PDFSCAN *ps)
  312. {
  313. ps->ostack_idx = 0;
  314. }
  315. static ref index_stack(PDFSCAN *ps, int n)
  316. {
  317. if (n < 0) {
  318. pdf_scan_msgf(ps, "index_stack: index must not be negative\n");
  319. return invalidref;
  320. }
  321. if (ps->ostack_idx <= n) {
  322. pdf_scan_msgf(ps, "index_stack: stack isn't that deep\n");
  323. return invalidref;
  324. }
  325. return ps->ostack[ps->ostack_idx-n];
  326. }
  327. static ref top_stack(PDFSCAN *ps)
  328. {
  329. if (ps->ostack_idx <= 0) {
  330. pdf_scan_msgf(ps, "top_stack: stack is empty\n");
  331. return invalidref;
  332. }
  333. return ps->ostack[ps->ostack_idx];
  334. }
  335. /*****************************************************************/
  336. /* references */
  337. static ref make_int(int value)
  338. {
  339. ref r;
  340. r.type = integertype;
  341. r.rsize = 0;
  342. r.value.intval = value;
  343. return r;
  344. }
  345. static ref make_string(PDFSCAN *ps, const char *str, int len)
  346. {
  347. ref r;
  348. r.type = stringtype;
  349. r.rsize = len;
  350. r.value.strval = pdf_scan_alloc(ps, str, len);
  351. if (r.value.strval == NULL)
  352. return invalidref;
  353. return r;
  354. }
  355. static ref make_name(PDFSCAN *ps, const char *str, int len)
  356. {
  357. ref r;
  358. r.type = nametype;
  359. r.rsize = len;
  360. r.value.nameval = pdf_scan_alloc(ps, str, len);
  361. if (r.value.nameval == NULL)
  362. return invalidref;
  363. return r;
  364. }
  365. static BOOL nameref_equals(ref *r, const char *name)
  366. {
  367. int len = (int)strlen(name);
  368. if (r->type != nametype)
  369. return FALSE;
  370. if (r->rsize != len)
  371. return FALSE;
  372. return (memcmp(r->value.nameval, name, len) == 0);
  373. }
  374. /* Get a reference from a dictionary */
  375. /* Return the result, but don't push it */
  376. static ref dict_get(PDFSCAN *ps, const char *name)
  377. {
  378. int namelen = (int)strlen(name);
  379. ref dict = top_stack(ps);
  380. ref *r;
  381. int dictlen;
  382. int i;
  383. if (dict.type == invalidtype)
  384. return invalidref;
  385. dictlen = dict.rsize * 2;
  386. for (i = 0; i<dictlen; i+=2) {
  387. r = &dict.value.dictval[i];
  388. if ((r->rsize == namelen) && (r->type == nametype) &&
  389. (memcmp(r->value.nameval, name, namelen) ==0))
  390. return dict.value.dictval[i+1];
  391. }
  392. return invalidref;
  393. }
  394. /* convert the items on the stack to an array on the stack */
  395. static ref array_to_mark(PDFSCAN *ps)
  396. {
  397. ref r;
  398. ref *array;
  399. int n = ps->ostack_idx;
  400. int len;
  401. while ((n>0) && (ps->ostack[n].type != marktype))
  402. n--;
  403. if (n == 0) {
  404. pdf_scan_msgf(ps, "array_to_mark: no mark on stack\n");
  405. return invalidref;
  406. }
  407. len = ps->ostack_idx - n;
  408. r.type = arraytype;
  409. r.rsize = len;
  410. r.value.arrayval = NULL;
  411. if (len) {
  412. array = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
  413. if (array)
  414. r.value.arrayval = array;
  415. else
  416. return invalidref;
  417. }
  418. ps->ostack_idx -= len + 1;
  419. push_stack(ps, r);
  420. return r;
  421. }
  422. /* convert the items on the stack to a dictionary on the stack */
  423. static ref dict_to_mark(PDFSCAN *ps)
  424. {
  425. ref r;
  426. ref *dict;
  427. int n = ps->ostack_idx;
  428. int len;
  429. while ((n>0) && (ps->ostack[n].type != marktype))
  430. n--;
  431. if (n == 0) {
  432. pdf_scan_msgf(ps, "dict_to_mark: no mark on stack\n");
  433. return invalidref;
  434. }
  435. len = ps->ostack_idx - n;
  436. if (len & 1) {
  437. pdf_scan_msgf(ps, "dict_to_mark: must have name/value pairs\n");
  438. return invalidref;
  439. }
  440. r.type = dicttype;
  441. r.rsize = len/2;
  442. r.value.arrayval = NULL;
  443. if (len) {
  444. dict = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
  445. if (dict)
  446. r.value.arrayval = dict;
  447. else
  448. return invalidref;
  449. }
  450. ps->ostack_idx -= len + 1;
  451. push_stack(ps, r);
  452. return r;
  453. }
  454. /*****************************************************************/
  455. /* Push reference from a token */
  456. static ref push_token(PDFSCAN *ps)
  457. {
  458. ref r;
  459. int len = ps->end - ps->begin;
  460. const char *p = ps->buf + ps->begin;
  461. r.type = ps->token_type;
  462. r.rsize = 0;
  463. r.value.voidval = NULL;
  464. switch(r.type) {
  465. case invalidtype:
  466. break;
  467. case marktype:
  468. break;
  469. case nulltype:
  470. break;
  471. case booltype:
  472. if ((len == 4) && (memcmp(p, "true", 4)==0))
  473. r.value.boolval = TRUE;
  474. else if ((len == 5) && (memcmp(p, "true", 5)==0))
  475. r.value.boolval = FALSE;
  476. else
  477. r = invalidref;
  478. break;
  479. case integertype:
  480. { char buf[64];
  481. if (len > (int)sizeof(buf)-1)
  482. r = invalidref;
  483. else {
  484. memcpy(buf, p, len);
  485. buf[len] = '\0';
  486. r.value.intval = atoi(buf);
  487. }
  488. }
  489. break;
  490. case realtype:
  491. { char buf[64];
  492. if (len > (int)sizeof(buf)-1)
  493. r = invalidref;
  494. else {
  495. memcpy(buf, p, len);
  496. buf[len] = '\0';
  497. r.value.realval = (float)atof(buf);
  498. }
  499. }
  500. break;
  501. case nametype:
  502. r = make_name(ps, p+1, len-1);
  503. break;
  504. case stringtype:
  505. r = make_string(ps, p, len);
  506. break;
  507. case streamtype:
  508. case commenttype:
  509. case objtype:
  510. case optype:
  511. case arraytype:
  512. case dicttype:
  513. /* Can't push these from a token */
  514. /* These are made by operators like stream, R, ], >> */
  515. return invalidref;
  516. default:
  517. r.type = invalidtype;
  518. break;
  519. }
  520. push_stack(ps, r);
  521. return r;
  522. }
  523. /* Process known operators */
  524. static int process_op(PDFSCAN *ps)
  525. {
  526. ref r;
  527. if (ps->token_type != optype)
  528. return 1; /* not an op */
  529. if (is_optoken(ps, "R")) {
  530. /* convert "n 0 R" to an indirect reference */
  531. ref r1 = index_stack(ps, 1);
  532. r = top_stack(ps);
  533. if ((r.type == integertype) && (r1.type == integertype)) {
  534. r.type = objtype;
  535. r.rsize = r.value.intval;
  536. r.value.intval = r1.value.intval;
  537. pop_stack(ps);
  538. pop_stack(ps);
  539. push_stack(ps, r);
  540. }
  541. }
  542. else if (is_optoken(ps, "]")) {
  543. array_to_mark(ps);
  544. }
  545. else if (is_optoken(ps, ">>")) {
  546. dict_to_mark(ps);
  547. }
  548. else if (is_optoken(ps, "null")) {
  549. r.type = nulltype;
  550. r.rsize = 0;
  551. r.value.voidval = NULL;
  552. push_stack(ps, r);
  553. }
  554. else if (is_optoken(ps, "obj")) {
  555. pdf_scan_msgf(ps, "ignoring obj token\n");
  556. /* ignore */
  557. }
  558. else if (is_optoken(ps, "endobj")) {
  559. pdf_scan_msgf(ps, "ignoring endobj token\n");
  560. /* ignore */
  561. }
  562. else if (is_optoken(ps, "stream")) {
  563. /* stream object contains offset to start of stream */
  564. r.type = streamtype;
  565. r.rsize = 0;
  566. r.value.streamval = ps->offset + ps->end;
  567. push_stack(ps, r);
  568. /* Now skip over stream */
  569. pdf_scan_next_token(ps);
  570. }
  571. else {
  572. pdf_scan_msgf(ps, "process_op: unrecognised operator \042");
  573. pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
  574. pdf_scan_msgf(ps, "\042\n");
  575. return -1;
  576. }
  577. return 0;
  578. }
  579. /*****************************************************************/
  580. /* Debugging and error messages */
  581. #ifdef NOTUSED
  582. /* Print a reference, returning number of characters written */
  583. static int
  584. print_ref(PDFSCAN *ps, ref *r)
  585. {
  586. int n = 0;
  587. switch(r->type) {
  588. case invalidtype:
  589. n = pdf_scan_msgf(ps, "--invalid--");
  590. break;
  591. case marktype:
  592. n = pdf_scan_msgf(ps, "--mark--");
  593. break;
  594. case nulltype:
  595. n = pdf_scan_msgf(ps, "--null--");
  596. break;
  597. case booltype:
  598. n = pdf_scan_msgf(ps, "%s", r->value.boolval ? "true" : "false");
  599. break;
  600. case integertype:
  601. n = pdf_scan_msgf(ps, "%d", r->value.intval);
  602. break;
  603. case realtype:
  604. n = pdf_scan_msgf(ps, "%g", r->value.realval);
  605. break;
  606. case nametype:
  607. n = pdf_scan_write(ps, "/", 1);
  608. pdf_scan_write(ps, r->value.nameval, r->rsize);
  609. break;
  610. case stringtype:
  611. n = pdf_scan_write(ps, "(", 1);
  612. n += pdf_scan_write(ps, r->value.strval, r->rsize);
  613. n += pdf_scan_write(ps, ")", 1);
  614. break;
  615. case streamtype:
  616. n = pdf_scan_msgf(ps, "--stream:%d--", r->value.streamval);
  617. break;
  618. case commenttype:
  619. n = pdf_scan_msgf(ps, "--comment--");
  620. break;
  621. case objtype:
  622. n = pdf_scan_msgf(ps, "--obj:%d--", r->value.objval);
  623. break;
  624. case optype:
  625. n = pdf_scan_msgf(ps, "--op:");
  626. n += pdf_scan_write(ps, r->value.opval, r->rsize);
  627. n += pdf_scan_write(ps, "--", 2);
  628. break;
  629. case arraytype:
  630. n = pdf_scan_msgf(ps, "--array:%d--", r->rsize);
  631. break;
  632. case dicttype:
  633. n = pdf_scan_msgf(ps, "--dict:%d--", r->rsize);
  634. break;
  635. default:
  636. n = pdf_scan_msgf(ps, "--unknown--");
  637. break;
  638. }
  639. return n;
  640. }
  641. /* print a reference, expanding array and dict */
  642. static int
  643. print_ref_expand(PDFSCAN *ps, ref *r)
  644. {
  645. int i;
  646. int n = 0;;
  647. if (r->type == arraytype) {
  648. n += pdf_scan_msgf(ps, "[ ");
  649. for (i=0; i<r->rsize; i++) {
  650. n += print_ref(ps, &r->value.arrayval[i]);
  651. n += pdf_scan_msgf(ps, " ");
  652. }
  653. n += pdf_scan_msgf(ps, "]");
  654. }
  655. else if (r->type == dicttype) {
  656. n += pdf_scan_msgf(ps, "<< ");
  657. for (i=0; i<r->rsize; i++) {
  658. n += print_ref(ps, &r->value.dictval[i+i]);
  659. n += pdf_scan_msgf(ps, " ");
  660. n += print_ref(ps, &r->value.dictval[i+i+1]);
  661. n += pdf_scan_msgf(ps, " ");
  662. }
  663. n += pdf_scan_msgf(ps, ">>");
  664. }
  665. else
  666. n += print_ref(ps, r);
  667. return n;
  668. }
  669. static void
  670. print_stack(PDFSCAN *ps)
  671. {
  672. int i, n=ps->ostack_idx;
  673. int col = 0;
  674. pdf_scan_msgf(ps, "Stack: ");
  675. for (i=1; i<=n; i++) {
  676. col += print_ref(ps, &ps->ostack[i]);
  677. if (col > 70) {
  678. pdf_scan_msgf(ps, "\n");
  679. col = 0;
  680. }
  681. else
  682. col += pdf_scan_msgf(ps, " ");
  683. }
  684. pdf_scan_msgf(ps, "\n");
  685. }
  686. static void
  687. print_stack_expand(PDFSCAN *ps)
  688. {
  689. int i, n=ps->ostack_idx;
  690. pdf_scan_msgf(ps, "Stack:\n");
  691. for (i=1; i<=n; i++) {
  692. pdf_scan_msgf(ps, "%2d: ", i);
  693. print_ref_expand(ps, &ps->ostack[i]);
  694. pdf_scan_msgf(ps, "\n");
  695. }
  696. }
  697. static void pdf_scan_print_allocated(PDFSCAN *ps)
  698. {
  699. int count = 0;
  700. int len = 0;
  701. PDFMEM *mem = ps->memory_head;
  702. while (mem) {
  703. len += sizeof(PDFMEM);
  704. len += mem->len;
  705. count++;
  706. mem = mem->next;
  707. }
  708. pdf_scan_msgf(ps, "Allocated memory %d bytes in %d objects\n",
  709. len, count);
  710. }
  711. #endif
  712. /*****************************************************************/
  713. /* object reading and cache */
  714. static int obj_add(PDFSCAN *ps, int objnum, ref objref)
  715. {
  716. if (ps->objs_count + 2 >= ps->objs_len) {
  717. /* allocate more space */
  718. ref *newobjs;
  719. int newlen = ps->objs_len + 256;
  720. if (newlen > ps->objs_maxlen) {
  721. pdf_scan_msgf(ps, "obj_add: too many objects to cache\n");
  722. return 0;
  723. }
  724. newobjs = (ref *)malloc(newlen * sizeof(ref));
  725. if (newobjs == NULL) {
  726. pdf_scan_msgf(ps, "obj_add: Out of memory\n");
  727. return 0;
  728. }
  729. memcpy(newobjs, ps->objs, ps->objs_len * sizeof(ref));
  730. free(ps->objs);
  731. ps->objs = newobjs;
  732. ps->objs_len = newlen;
  733. }
  734. ps->objs[ps->objs_count++] = make_int(objnum);
  735. ps->objs[ps->objs_count++] = objref;
  736. return ps->objs_count;
  737. }
  738. static ref obj_find(PDFSCAN *ps, int objnum)
  739. {
  740. int i;
  741. for (i=0; i<ps->objs_count; i+=2) {
  742. if (objnum == ps->objs[i].value.intval)
  743. return ps->objs[i+1];
  744. }
  745. return invalidref;
  746. }
  747. static void clear_objs(PDFSCAN *ps)
  748. {
  749. ps->objs_count = 0;
  750. }
  751. /*****************************************************************/
  752. /* token parsing */
  753. static int is_white(char ch)
  754. {
  755. return (ch == '\0') || (ch == '\t') || (ch == '\n') ||
  756. (ch == '\f') || (ch == '\r') || (ch == ' ');
  757. }
  758. static int is_delimiter(char ch)
  759. {
  760. return (ch == '(') || (ch == ')') ||
  761. (ch == '<') || (ch == '>') ||
  762. (ch == '[') || (ch == ']') ||
  763. (ch == '{') || (ch == '}') ||
  764. (ch == '/') || (ch == '%');
  765. }
  766. /* Scan next token from buffer, returning token type and offset to begin
  767. * and end of token.
  768. * Return 0 if OK, 1 if no token or not enough data, -1 on error
  769. */
  770. static int pdf_scan_token(const char *buf, int buflen,
  771. rtype *ttype, int *tbegin, int *tend)
  772. {
  773. int code = -1;
  774. int i = 0;
  775. rtype type;
  776. int begin, end;
  777. *ttype = type = invalidtype;
  778. *tbegin = begin = 0;
  779. *tend = end = 0;
  780. while ((i < buflen) && is_white(buf[i]))
  781. i++;
  782. if (i == buflen)
  783. return 1;
  784. begin = i;
  785. if (buf[i] == '%') {
  786. while (i < buflen) {
  787. if ((buf[i] == '\n') || (buf[i] == '\r')) {
  788. type = commenttype;
  789. end = i;
  790. code = 0;
  791. break;
  792. }
  793. i++;
  794. }
  795. if (i >= buflen)
  796. code = 1;
  797. }
  798. else if (buf[i] == '(') {
  799. /* string */
  800. int pcount = 0;
  801. type = stringtype;
  802. i++;
  803. while (i < buflen) {
  804. if (buf[i] == '\\')
  805. i++;
  806. else if (buf[i] == '(')
  807. pcount++;
  808. else if (buf[i] == ')') {
  809. if (pcount <= 0) {
  810. end = i+1;
  811. code = 0;
  812. break;
  813. }
  814. else
  815. pcount--;
  816. }
  817. i++;
  818. }
  819. if (i >= buflen)
  820. code = 1;
  821. }
  822. else if (buf[i] == '<') {
  823. i++;
  824. if (i >= buflen) {
  825. code = 1;
  826. }
  827. else if (buf[i] == '<') {
  828. /* marktype */
  829. end = i+1;
  830. type = marktype;
  831. code = 0;
  832. }
  833. else {
  834. /* hexadecimal string */
  835. type = stringtype;
  836. while (i < buflen) {
  837. if (buf[i] == '>') {
  838. end = i+1;
  839. code = 0;
  840. break;
  841. }
  842. i++;
  843. }
  844. if (i >= buflen)
  845. code = 1;
  846. }
  847. }
  848. else if (buf[i] == '[') {
  849. code = 0;
  850. end = i+1;
  851. type = marktype;
  852. }
  853. else if (buf[i] == '/') {
  854. /* name */
  855. type = nametype;
  856. i++;
  857. while (i < buflen) {
  858. if (is_white(buf[i]) || is_delimiter(buf[i])) {
  859. end = i;
  860. code = 0;
  861. break;
  862. }
  863. i++;
  864. }
  865. if (i >= buflen)
  866. code = 1;
  867. }
  868. else if (is_delimiter(buf[i])) {
  869. /* skip over delimiter */
  870. if (buf[i] == '>') {
  871. i++;
  872. if (i < buflen) {
  873. if (buf[i] == '>') {
  874. type = optype;
  875. end = i+1;
  876. code = 0;
  877. }
  878. else
  879. code = -1;
  880. }
  881. }
  882. else {
  883. type = optype;
  884. end = i+1;
  885. code = 0;
  886. }
  887. if (i >= buflen)
  888. code = 1;
  889. }
  890. else {
  891. /* First assume that it is an op */
  892. type = optype;
  893. while (i < buflen) {
  894. if (is_white(buf[i]) || is_delimiter(buf[i])) {
  895. end = i;
  896. code = 0;
  897. break;
  898. }
  899. i++;
  900. }
  901. if (i >= buflen)
  902. code = 1;
  903. /* try to convert it into a bool */
  904. if ((code == 0) && (type == optype)) {
  905. if ((end - begin == 4) &&
  906. (memcmp(buf+begin, "true", 4) == 0)) {
  907. type = booltype;
  908. }
  909. else if ((end - begin == 5) &&
  910. (memcmp(buf+begin, "false", 5) == 0)) {
  911. type = booltype;
  912. }
  913. }
  914. /* try to convert it into an integer */
  915. if ((code == 0) && (type == optype)) {
  916. int j;
  917. char ch;
  918. BOOL isreal = FALSE;
  919. BOOL isnum = TRUE;
  920. for (j=begin; j<end; j++) {
  921. ch = buf[j];
  922. if (ch == '.')
  923. isreal = TRUE;
  924. if (!((ch == '-') || (ch == '+') || (ch == '.') ||
  925. isdigit((int)ch)))
  926. isnum = FALSE;
  927. }
  928. if (isnum) {
  929. if (isreal)
  930. type = realtype;
  931. else
  932. type = integertype;
  933. }
  934. }
  935. }
  936. *ttype = type;
  937. *tbegin = begin;
  938. *tend = end;
  939. return code;
  940. }
  941. /*****************************************************************/
  942. static void pdf_scan_finish(PDFSCAN *ps)
  943. {
  944. if (ps->file) {
  945. fclose(ps->file);
  946. ps->file = NULL;
  947. }
  948. if (ps->buf) {
  949. free(ps->buf);
  950. ps->buf = NULL;
  951. }
  952. ps->buflen = 0;
  953. if (ps->xref) {
  954. free(ps->xref);
  955. ps->xref = NULL;
  956. }
  957. ps->xref_len = 0;
  958. if (ps->ostack) {
  959. free(ps->ostack);
  960. ps->ostack = NULL;
  961. }
  962. ps->ostack_len = 0;
  963. ps->ostack_idx = 0;
  964. if (ps->objs) {
  965. free(ps->objs);
  966. ps->objs = NULL;
  967. }
  968. ps->objs_len = 0;
  969. ps->objs_count = 0;
  970. memset(ps, 0, sizeof(PDFSCAN));
  971. }
  972. static int pdf_scan_open_file(PDFSCAN *ps)
  973. {
  974. ps->file = csfopen(ps->filename, TEXT("rb"));
  975. if (ps->file == NULL)
  976. return -1;
  977. return 0;
  978. }
  979. static int pdf_scan_init(PDFSCAN *ps, const TCHAR *name)
  980. {
  981. int len = (int)(cslen(name)+1) * sizeof(TCHAR);
  982. if (len > (int)sizeof(ps->filename))
  983. return -1;
  984. memcpy(ps->filename, name, len);
  985. if (pdf_scan_open_file(ps) != 0)
  986. return -1;
  987. ps->buflen = 256;
  988. ps->buf = (char *)malloc(ps->buflen);
  989. if (ps->buf == NULL) {
  990. pdf_scan_finish(ps);
  991. return -2;
  992. }
  993. ps->ostack_maxlen = 4096;
  994. ps->ostack_len = 256;
  995. ps->ostack_idx = 0; /* empty */
  996. ps->ostack = (ref *)malloc(ps->ostack_len * sizeof(ref));
  997. if (ps->ostack == NULL) {
  998. pdf_scan_finish(ps);
  999. return -2;
  1000. }
  1001. /* make first item on stack invalid */
  1002. ps->ostack[0].type = invalidtype;
  1003. ps->ostack[0].rsize = 0;
  1004. ps->ostack[0].value.voidval = NULL;
  1005. /* object cache */
  1006. ps->objs_maxlen = 1024;
  1007. ps->objs_len = 256;
  1008. ps->objs_count = 0; /* empty */
  1009. ps->objs = (ref *)malloc(ps->objs_len * sizeof(ref));
  1010. if (ps->objs == NULL) {
  1011. pdf_scan_finish(ps);
  1012. return -2;
  1013. }
  1014. ps->pagenum = -1; /* no cached media info yet */
  1015. return 0;
  1016. }
  1017. static int pdf_scan_seek(PDFSCAN *ps, long offset, PDFSEEK whence)
  1018. {
  1019. int code = -1;
  1020. switch (whence) {
  1021. case PDFSEEK_CUR:
  1022. offset = ps->offset + ps->end + offset;
  1023. case PDFSEEK_SET:
  1024. ps->begin = ps->end = ps->len = 0;
  1025. code = fseek(ps->file, offset, SEEK_SET);
  1026. ps->offset = offset;
  1027. break;
  1028. case PDFSEEK_END:
  1029. code = fseek(ps->file, 0, SEEK_END);
  1030. ps->begin = ps->end = ps->len = 0;
  1031. ps->offset = ftell(ps->file);
  1032. break;
  1033. }
  1034. return code;
  1035. }
  1036. /* Read next token from PDF file */
  1037. /* Return 0 if OK, or -1 if EOF, -2 if error */
  1038. /* Set *token_type to token type */
  1039. static int pdf_scan_next_token(PDFSCAN *ps)
  1040. {
  1041. int code = 0;
  1042. int count;
  1043. rtype type=invalidtype;
  1044. int begin=0, end=0;
  1045. do {
  1046. if ((code == 1) && ps->end) {
  1047. /* move characters to front of buffer */
  1048. if (ps->len - ps->end)
  1049. memmove(ps->buf, ps->buf+ps->end, ps->len - ps->end);
  1050. ps->offset += ps->end;
  1051. ps->len = ps->len - ps->end;
  1052. ps->begin = 0;
  1053. ps->end = 0;
  1054. }
  1055. if ((code == 1) && (ps->len >= ps->buflen)) {
  1056. /* increase buffer size */
  1057. char *newbuf;
  1058. int newbuflen = 2 * ps->buflen;
  1059. newbuf = (char *)malloc(newbuflen);
  1060. if (newbuf) {
  1061. memcpy(newbuf, ps->buf, ps->buflen);
  1062. free(ps->buf);
  1063. ps->buf = newbuf;
  1064. ps->buflen = newbuflen;
  1065. }
  1066. else {
  1067. pdf_scan_msgf(ps, "Out of memory in pdf_scan_next_token\n");
  1068. pdf_scan_msgf(ps, "Tried to realloc %d to %d\n",
  1069. ps->buflen, newbuflen);
  1070. code = -2;
  1071. break;
  1072. }
  1073. }
  1074. if ((code == 1) || (ps->len == 0)) {
  1075. count = (int)fread(ps->buf+ps->len, 1, ps->buflen-ps->len,
  1076. ps->file);
  1077. if (count == 0) {
  1078. pdf_scan_msgf(ps, "EOF in pdf_scan_next_token\n");
  1079. code = -1;
  1080. break;
  1081. }
  1082. ps->len += count;
  1083. }
  1084. while (ps->instream) {
  1085. /* We are in a stream. Keep reading until we find
  1086. * the endstream. This isn't robust. It can be fooled
  1087. * by "endstream" occuring within a stream.
  1088. */
  1089. while ((ps->end < ps->len) && (ps->buf[ps->end] != 'e'))
  1090. ps->end++;
  1091. /* look for endstream */
  1092. if (ps->end + 9 >= ps->len) {
  1093. code = 1; /* need more */
  1094. break;
  1095. }
  1096. if (memcmp(ps->buf+ps->end, "endstream", 9) == 0)
  1097. ps->instream = FALSE;
  1098. else
  1099. ps->end++;
  1100. }
  1101. if (!ps->instream)
  1102. code = pdf_scan_token(ps->buf+ps->end, ps->len - ps->end,
  1103. &type, &begin, &end);
  1104. } while (code == 1);
  1105. if (code == 0) {
  1106. /* got a token */
  1107. ps->begin = ps->end + begin;
  1108. ps->end = ps->end + end;
  1109. ps->token_type = type;
  1110. if ((type == optype) && (ps->end-ps->begin == 6) &&
  1111. (memcmp(ps->buf+ps->begin, "stream", 6) == 0))
  1112. ps->instream = TRUE;
  1113. }
  1114. return code;
  1115. }
  1116. /*****************************************************************/
/* Reading %%EOF, xref, trailer */
  1118. static int
  1119. previous_line(const char *str, int len)
  1120. {
  1121. int i = len-1;
  1122. /* first skip over EOL */
  1123. while ((i > 0) && ((str[i]=='\r') || (str[i]=='\n')))
  1124. i--;
  1125. while ((i > 0) && !((str[i]=='\r') || (str[i]=='\n')))
  1126. i--;
  1127. if (!((str[i]=='\r') || (str[i]=='\n')))
  1128. return -1; /* didn't find a line */
  1129. return i+1;
  1130. }
  1131. static int
  1132. pdf_scan_find_xref(PDFSCAN *ps)
  1133. {
  1134. char buf[4096];
  1135. int i, j;
  1136. int code = -1;
  1137. int count;
  1138. pdf_scan_seek(ps, 0, PDFSEEK_END);
  1139. count = min((int)sizeof(buf), ps->offset);
  1140. pdf_scan_seek(ps, -count, PDFSEEK_CUR);
  1141. count = (int)fread(buf, 1, sizeof(buf), ps->file);
  1142. pdf_scan_seek(ps, 0, PDFSEEK_SET);
  1143. if (count == 0)
  1144. return -1;
  1145. i = count - 5;
  1146. while (i > 0) {
  1147. /* Find %%EOF */
  1148. if (memcmp(buf+i, "%%EOF", 5) == 0) {
  1149. code = 0;
  1150. break;
  1151. }
  1152. i--;
  1153. }
  1154. if (i == 0) {
  1155. pdf_scan_msgf(ps, "Failed to find %%EOF\n");
  1156. code = -1;
  1157. }
  1158. if (code == 0) {
  1159. /* Look for xref table offset */
  1160. j = previous_line(buf, i);
  1161. if (j >= 0)
  1162. ps->xref_offset = atol(buf+j);
  1163. else
  1164. code = -1;
  1165. i = j;
  1166. if (ps->xref_offset == 0)
  1167. code = -1;
  1168. if (code != 0)
  1169. pdf_scan_msgf(ps, "Failed to find cross reference table\n");
  1170. }
  1171. if (code == 0) {
  1172. /* Look for "startxref" */
  1173. j = previous_line(buf, i);
  1174. if (j >= 0) {
  1175. if (memcmp(buf+j, "startxref", 9) != 0)
  1176. code = -1;
  1177. }
  1178. else {
  1179. code = -1;
  1180. }
  1181. if (code != 0)
  1182. pdf_scan_msgf(ps, "Failed to find startxref\n");
  1183. }
  1184. return code;
  1185. }
  1186. /* Read a cross reference table */
  1187. /* This is called for each cross reference table */
  1188. static int
  1189. pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset)
  1190. {
  1191. int code;
  1192. int i;
  1193. int first = 0;
  1194. int count = 0;
  1195. unsigned long prev = 0;
  1196. unsigned long offset = 0;
  1197. int generation = 0;
  1198. BOOL used = FALSE;
  1199. pdf_scan_seek(ps, xref_offset, PDFSEEK_SET);
  1200. code = pdf_scan_next_token(ps);
  1201. if (code == 0)
  1202. code = op_check(ps, "xref");
  1203. while (code == 0) {
  1204. code = pdf_scan_next_token(ps);
  1205. if ((code == 0) && is_optoken(ps, "trailer"))
  1206. break; /* finished this xref table */
  1207. if (code == 0) {
  1208. first = atoi(ps->buf + ps->begin);
  1209. code = pdf_scan_next_token(ps);
  1210. }
  1211. if (code == 0) {
  1212. count = atoi(ps->buf + ps->begin);
  1213. }
  1214. if (code == 0) {
  1215. /* make sure there is enough space in the table */
  1216. if (first + count > ps->xref_len) {
  1217. int len = (first + count) * sizeof(PDFXREF);
  1218. PDFXREF *newxref = (PDFXREF *)malloc(len);
  1219. if (newxref) {
  1220. memset(newxref, 0, len);
  1221. memcpy(newxref, ps->xref, ps->xref_len * sizeof(PDFXREF));
  1222. free(ps->xref);
  1223. ps->xref = newxref;
  1224. ps->xref_len = first + count;
  1225. }
  1226. else {
  1227. pdf_scan_msgf(ps, "pdf_scan_read_xref: out of memory\n");
  1228. code = -2;
  1229. break;
  1230. }
  1231. }
  1232. }
  1233. for (i=first; i<first+count; i++) {
  1234. code = pdf_scan_next_token(ps);
  1235. if (code == 0) {
  1236. offset = atol(ps->buf+ps->begin);
  1237. code = pdf_scan_next_token(ps);
  1238. }
  1239. if (code == 0) {
  1240. generation = atoi(ps->buf+ps->begin);
  1241. code = pdf_scan_next_token(ps);
  1242. }
  1243. if (code == 0) {
  1244. if (is_optoken(ps, "n"))
  1245. used = TRUE;
  1246. else if (is_optoken(ps, "f"))
  1247. used = FALSE;
  1248. else
  1249. code = -1;
  1250. }
  1251. /* We don't deal correctly with generation.
  1252. * We assume that the first xref table that marks an
  1253. * object as used is the definitive reference.
  1254. */
  1255. if (code == 0) {
  1256. if (!(ps->xref[i].used)) {
  1257. ps->xref[i].offset = offset;
  1258. ps->xref[i].generation = generation;
  1259. ps->xref[i].used = used;
  1260. }
  1261. }
  1262. }
  1263. }
  1264. if (code == 0) {
  1265. code = pdf_scan_read_trailer(ps, &prev);
  1266. if ((code == 0) && prev && prev != ps->xref_offset) {
  1267. /* read older xref and trailer */
  1268. code = pdf_scan_read_xref(ps, prev);
  1269. }
  1270. }
  1271. return code;
  1272. }
  1273. /* Read a trailer */
  1274. static int
  1275. pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev)
  1276. {
  1277. int code = 0;
  1278. ref p;
  1279. code = pdf_scan_next_token(ps);
  1280. if ((code == 0) && (ps->token_type != marktype))
  1281. code = -1;
  1282. push_token(ps);
  1283. while (code == 0) {
  1284. code = pdf_scan_next_token(ps);
  1285. if (code != 0)
  1286. break;
  1287. if (is_optoken(ps, "startxref")) {
  1288. if (ps->root == 0) {
  1289. p = dict_get(ps, "Root");
  1290. if (p.type == objtype)
  1291. ps->root = p.value.objval;
  1292. else {
  1293. pdf_scan_msgf(ps,
  1294. "trailer /Root requires indirect reference\n");
  1295. code = -1;
  1296. }
  1297. }
  1298. p = dict_get(ps, "Prev");
  1299. if (p.type == integertype)
  1300. *prev = p.value.intval;
  1301. else if (p.type != invalidtype) {
  1302. code = -1;
  1303. pdf_scan_msgf(ps, "trailer /Prev requires integer\n");
  1304. }
  1305. break;
  1306. }
  1307. if (process_op(ps) != 0)
  1308. push_token(ps);
  1309. }
  1310. if (code != 0)
  1311. pdf_scan_msgf(ps, "Error reading trailer\n");
  1312. return code;
  1313. }
  1314. static int pdf_scan_read_object_start(PDFSCAN *ps, int objnum)
  1315. {
  1316. int code = 0;
  1317. int value = 0;
  1318. if (objnum == 0) {
  1319. pdf_scan_msgf(ps, "Object 0 is always unused\n");
  1320. return -1;
  1321. }
  1322. if (objnum >= ps->xref_len) {
  1323. pdf_scan_msgf(ps, "Object reference %d doesn't exist. There are only %d objects\n", objnum, ps->xref_len);
  1324. return -1;
  1325. }
  1326. if (!ps->xref[objnum].used) {
  1327. pdf_scan_msgf(ps, "Object %d is unused\n", objnum);
  1328. return -1;
  1329. }
  1330. pdf_scan_seek(ps, ps->xref[objnum].offset, PDFSEEK_SET);
  1331. code = pdf_scan_next_token(ps); /* object number */
  1332. if (code == 0)
  1333. code = type_check(ps, integertype);
  1334. if (code == 0) {
  1335. value = atoi(ps->buf+ps->begin); /* object number */
  1336. code = pdf_scan_next_token(ps); /* generation */
  1337. }
  1338. if (code == 0)
  1339. code = type_check(ps, integertype);
  1340. if (code == 0)
  1341. code = pdf_scan_next_token(ps); /* obj */
  1342. if (code == 0)
  1343. code = op_check(ps, "obj");
  1344. if (value != objnum) {
  1345. pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
  1346. return -1;
  1347. }
  1348. return code;
  1349. }
  1350. /*****************************************************************/
  1351. /* Read an object, and leave it on the stack */
  1352. static int
  1353. pdf_scan_read_object(PDFSCAN *ps, int objnum)
  1354. {
  1355. int code;
  1356. ref objref = obj_find(ps, objnum);
  1357. if (objref.type != invalidtype) {
  1358. /* found in cache */
  1359. push_stack(ps, objref);
  1360. return 0;
  1361. }
  1362. code = pdf_scan_read_object_start(ps, objnum);
  1363. if (code) {
  1364. pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
  1365. return -1;
  1366. }
  1367. code = pdf_scan_next_token(ps);
  1368. if ((code == 0) && (ps->token_type != marktype))
  1369. code = -1;
  1370. push_token(ps);
  1371. while (code == 0) {
  1372. code = pdf_scan_next_token(ps);
  1373. if (code != 0)
  1374. break;
  1375. if (is_optoken(ps, "endobj")) {
  1376. obj_add(ps, objnum, top_stack(ps));
  1377. break;
  1378. }
  1379. if (process_op(ps) != 0)
  1380. push_token(ps);
  1381. }
  1382. return code;
  1383. }
  1384. /*****************************************************************/
  1385. /* find the object number for a page */
  1386. /* Return <= 0 if failure, or object number */
  1387. /* First page is 0 */
  1388. static int pdf_scan_find_page(PDFSCAN *ps, int pagenum)
  1389. {
  1390. int code;
  1391. ref kids;
  1392. ref r;
  1393. int pageobj = 0;
  1394. int count_base = 0;
  1395. int count;
  1396. ref *pref;
  1397. int i;
  1398. int inext;
  1399. if (pagenum >= ps->page_count) {
  1400. pdf_scan_msgf(ps, "Not that many pages\n");
  1401. return -1;
  1402. }
  1403. code = pdf_scan_read_object(ps, ps->pages);
  1404. if (code) {
  1405. pdf_scan_msgf(ps, "Didn't find Pages object\n");
  1406. return -1;
  1407. }
  1408. /* iterate through Kids, looking for the one that includes this page */
  1409. kids = dict_get(ps, "Kids");
  1410. if (kids.type != arraytype) {
  1411. pdf_scan_msgf(ps, "/Pages object %d must contain /Kids array\n",
  1412. ps->pages);
  1413. return -1;
  1414. }
  1415. pop_stack(ps); /* First Pages */
  1416. for (i = 0; (i < kids.rsize) && (code == 0); i=inext) {
  1417. inext = i+1;
  1418. pref = &kids.value.arrayval[i];
  1419. if (pref->type == objtype)
  1420. code = pdf_scan_read_object(ps, pref->value.objval);
  1421. if (code == 0) {
  1422. r = dict_get(ps, "Type");
  1423. if (nameref_equals(&r, "Page")) {
  1424. if (count_base + i == pagenum) {
  1425. /* this is it */
  1426. pageobj = pref->value.objval;
  1427. pop_stack(ps); /* the wanted page */
  1428. break;
  1429. }
  1430. }
  1431. else if (nameref_equals(&r, "Pages")) {
  1432. r = dict_get(ps, "Count");
  1433. if (r.type == integertype) {
  1434. count = r.value.intval;
  1435. if (pagenum < count_base + count) {
  1436. /* It's under this child */
  1437. inext = 0;
  1438. pop_stack(ps); /* The old /Pages */
  1439. code = pdf_scan_read_object(ps, pref->value.objval);
  1440. if (code == 0) {
  1441. kids = dict_get(ps, "Kids");
  1442. if (kids.type != arraytype) {
  1443. pdf_scan_msgf(ps,
  1444. "/Pages object %d must contain /Kids array\n",
  1445. pref->value.objval);
  1446. code = -1;
  1447. }
  1448. }
  1449. }
  1450. else {
  1451. count_base += count;
  1452. }
  1453. }
  1454. else {
  1455. pdf_scan_msgf(ps, "/Pages /Count must be integer\n");
  1456. code = -1;
  1457. }
  1458. }
  1459. else {
  1460. pdf_scan_msgf(ps,
  1461. "pdf_scan_find_page: object %d isn't Pages or Page\n",
  1462. pref->value.objval);
  1463. code = -1;
  1464. }
  1465. pop_stack(ps);
  1466. }
  1467. }
  1468. if (pageobj <= 0) {
  1469. pdf_scan_msgf(ps, "Failed to find page %d\n", pagenum+1);
  1470. code = -1;
  1471. }
  1472. if (code)
  1473. return -1;
  1474. /* Don't clean up, since we will use the cached objects
  1475. * when extracting the page media.
  1476. */
  1477. return pageobj;
  1478. }
  1479. static int
  1480. pdf_scan_read_page_count(PDFSCAN *ps)
  1481. {
  1482. int code;
  1483. ref p;
  1484. code = pdf_scan_read_object(ps, ps->pages);
  1485. if (code) {
  1486. pdf_scan_msgf(ps, "Didn't find Pages object\n");
  1487. return -1;
  1488. }
  1489. p = dict_get(ps, "Type");
  1490. if (!nameref_equals(&p, "Pages")) {
  1491. pdf_scan_msgf(ps, "Pages object didn't have /Type /Pages\n");
  1492. return -1;
  1493. }
  1494. p = dict_get(ps, "Count");
  1495. if (p.type != integertype) {
  1496. pdf_scan_msgf(ps, "Pages object didn't integer /Count\n");
  1497. return -1;
  1498. }
  1499. ps->page_count = p.value.intval;
  1500. return code;
  1501. }
  1502. static int convert_float(ref r, float *f)
  1503. {
  1504. if (r.type == realtype)
  1505. *f = r.value.realval;
  1506. else if (r.type == integertype)
  1507. *f = (float)r.value.intval;
  1508. else
  1509. return -1;
  1510. return 0;
  1511. }
  1512. static int
  1513. pdf_scan_read_bbox(PDFBBOX *box, ref array)
  1514. {
  1515. int code = 0;
  1516. if (array.type != arraytype)
  1517. code = -1;
  1518. if (array.rsize != 4)
  1519. code = -1;
  1520. if (code == 0)
  1521. code = convert_float(array.value.arrayval[0], &box->llx);
  1522. if (code == 0)
  1523. code = convert_float(array.value.arrayval[1], &box->lly);
  1524. if (code == 0)
  1525. code = convert_float(array.value.arrayval[2], &box->urx);
  1526. if (code == 0)
  1527. code = convert_float(array.value.arrayval[3], &box->ury);
  1528. return code;
  1529. }
  1530. /* Read catalog and leave on stack */
  1531. static int
  1532. pdf_scan_read_catalog(PDFSCAN *ps)
  1533. {
  1534. int code;
  1535. ref p;
  1536. /* Read root object, making sure it is /Type /Catalog,
  1537. * and that /Pages is an indirect reference
  1538. */
  1539. code = pdf_scan_read_object(ps, ps->root);
  1540. if (code) {
  1541. pdf_scan_msgf(ps, "Didn't find Root object\n");
  1542. return -1;
  1543. }
  1544. p = dict_get(ps, "Type");
  1545. if (!nameref_equals(&p, "Catalog")) {
  1546. pdf_scan_msgf(ps, "Root object didn't have /Type /Catalog\n");
  1547. return -1;
  1548. }
  1549. p = dict_get(ps, "Pages");
  1550. if (p.type != objtype) {
  1551. pdf_scan_msgf(ps, "Root object didn't indirect reference to /Pages\n");
  1552. return -1;
  1553. }
  1554. ps->pages = p.value.intval;
  1555. return 0;
  1556. }
  1557. /*****************************************************************/
  1558. /* public functions */
  1559. void
  1560. pdf_scan_close(PDFSCAN *ps)
  1561. {
  1562. pdf_scan_cleanup(ps);
  1563. pdf_scan_finish(ps);
  1564. free(ps);
  1565. }
  1566. PDFSCAN *
  1567. pdf_scan_open(const TCHAR *filename, void *handle,
  1568. int (*fn)(void *handle, const char *ptr, int len))
  1569. {
  1570. int code;
  1571. int rotate;
  1572. PDFBBOX mediabox, cropbox;
  1573. PDFSCAN *ps = (PDFSCAN *)malloc(sizeof(PDFSCAN));
  1574. if (ps == NULL)
  1575. return NULL;
  1576. memset(ps, 0, sizeof(PDFSCAN));
  1577. ps->handle = handle;
  1578. ps->print_fn = fn;
  1579. code = pdf_scan_init(ps, filename);
  1580. if (code == -1)
  1581. pdf_scan_msgf(ps, "Couldn't open PDF file\n");
  1582. else if (code != 0)
  1583. pdf_scan_msgf(ps, "Error initialising PDF scanner\n");
  1584. if (code == 0)
  1585. code = pdf_scan_find_xref(ps);
  1586. if (code == 0)
  1587. code = pdf_scan_read_xref(ps, ps->xref_offset);
  1588. if (code == 0)
  1589. code = pdf_scan_read_catalog(ps);
  1590. if (code == 0)
  1591. code = pdf_scan_read_page_count(ps);
  1592. if (code == 0)
  1593. code = pdf_scan_page_media(ps, 0, &rotate, &mediabox, &cropbox);
  1594. pdf_scan_cleanup(ps);
  1595. if (code != 0) {
  1596. pdf_scan_close(ps);
  1597. ps = NULL;
  1598. }
  1599. return ps;
  1600. }
  1601. int
  1602. pdf_scan_page_count(PDFSCAN *ps)
  1603. {
  1604. if (ps == NULL)
  1605. return 0;
  1606. return ps->page_count;
  1607. }
  1608. int
  1609. pdf_scan_page_media(PDFSCAN *ps, int pagenum, int *rotate,
  1610. PDFBBOX *mediabox, PDFBBOX *cropbox)
  1611. {
  1612. BOOL found_rotate = FALSE;
  1613. BOOL found_mediabox = FALSE;
  1614. BOOL found_cropbox = FALSE;
  1615. BOOL has_parent = TRUE;
  1616. ref p, objref;
  1617. int objnum;
  1618. if (ps == NULL)
  1619. return -1;
  1620. if (pagenum == ps->pagenum) {
  1621. /* Used cached values */
  1622. *rotate = ps->rotate;
  1623. *mediabox = ps->mediabox;
  1624. *cropbox = ps->cropbox;
  1625. return 0;
  1626. }
  1627. if (ps->file == NULL) {
  1628. if (pdf_scan_open_file(ps) != 0)
  1629. return -1;
  1630. }
  1631. objnum = pdf_scan_find_page(ps, pagenum);
  1632. if (objnum <= 0) {
  1633. pdf_scan_cleanup(ps);
  1634. return -1;
  1635. }
  1636. if (pdf_scan_read_object(ps, objnum) < 0) {
  1637. pdf_scan_cleanup(ps);
  1638. return -1;
  1639. }
  1640. while (has_parent) {
  1641. if (!found_rotate) {
  1642. p = dict_get(ps, "Rotate");
  1643. if (p.type == integertype) {
  1644. *rotate = p.value.intval;
  1645. found_rotate = TRUE;
  1646. }
  1647. }
  1648. if (!found_mediabox) {
  1649. p = dict_get(ps, "MediaBox");
  1650. if (pdf_scan_read_bbox(mediabox, p) == 0)
  1651. found_mediabox = TRUE;
  1652. }
  1653. if (!found_cropbox) {
  1654. p = dict_get(ps, "CropBox");
  1655. if (pdf_scan_read_bbox(cropbox, p) == 0)
  1656. found_cropbox = TRUE;
  1657. }
  1658. if (found_rotate && found_mediabox && found_cropbox)
  1659. break;
  1660. p = dict_get(ps, "Parent");
  1661. if (p.type == objtype) {
  1662. objref = pop_stack(ps);
  1663. if (pdf_scan_read_object(ps, p.value.objval) < 0) {
  1664. push_stack(ps, objref);
  1665. has_parent = FALSE;
  1666. }
  1667. }
  1668. else
  1669. has_parent = FALSE;
  1670. }
  1671. pop_stack(ps);
  1672. if (!found_cropbox) {
  1673. *cropbox = *mediabox;
  1674. found_cropbox = TRUE;
  1675. }
  1676. if (!found_rotate) {
  1677. *rotate = 0;
  1678. found_rotate = TRUE;
  1679. }
  1680. pdf_scan_cleanup(ps);
  1681. if (found_rotate && found_mediabox && found_cropbox) {
  1682. /* cache these values */
  1683. ps->pagenum = pagenum;
  1684. ps->rotate = *rotate;
  1685. ps->mediabox = *mediabox;
  1686. ps->cropbox = *cropbox;
  1687. return 0;
  1688. }
  1689. return -1;
  1690. }
  1691. /*****************************************************************/
  1692. #ifdef DEMO_PDFSCAN
  1693. int test_print_fn(void *handle, const char *ptr, int len)
  1694. {
  1695. fwrite(ptr, 1, len, stdout);
  1696. return len;
  1697. }
  1698. int main(int argc, char *argv[])
  1699. {
  1700. PDFSCAN *ps;
  1701. int i, count;
  1702. int code;
  1703. PDFBBOX mediabox, cropbox;
  1704. int rotate;
  1705. if (argc < 2) {
  1706. fprintf(stdout, "Usage: cpdfscan filename\n");
  1707. return 1;
  1708. }
  1709. ps = pdf_scan_open(argv[1], NULL, test_print_fn);
  1710. if (ps) {
  1711. count = pdf_scan_page_count(ps);
  1712. pdf_scan_msgf(ps, "Page count is %d\n", count);
  1713. for (i=0; i<count; i++) {
  1714. code = pdf_scan_page_media(ps, i, &rotate, &mediabox, &cropbox);
  1715. if (code == 0) {
  1716. fprintf(stdout, "Page %d /Rotate %d ", i+1, rotate);
  1717. fprintf(stdout, "/MediaBox [%g %g %g %g] /CropBox [%g %g %g %g]\n",
  1718. mediabox.llx, mediabox.lly, mediabox.urx, mediabox.ury,
  1719. cropbox.llx, cropbox.lly, cropbox.urx, cropbox.ury);
  1720. }
  1721. else
  1722. fprintf(stdout, "Page %d media unknown\n", i+1);
  1723. }
  1724. pdf_scan_close(ps);
  1725. }
  1726. return 0;
  1727. }
  1728. #endif