PageRenderTime 53ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/htdocs/includes/fpdfi/pdf_parser.php

https://bitbucket.org/speedealing/speedealing
PHP | 719 lines | 411 code | 140 blank | 168 comment | 131 complexity | 7f2fcf98013f7830e1e8b3c1e26e6a95 MD5 | raw file
Possible License(s): LGPL-3.0, LGPL-2.1, GPL-3.0, MIT
  1. <?php
  2. //
  3. // FPDI - Version 1.4.2
  4. //
  5. // Copyright 2004-2011 Setasign - Jan Slabon
  6. //
  7. // Licensed under the Apache License, Version 2.0 (the "License");
  8. // you may not use this file except in compliance with the License.
  9. // You may obtain a copy of the License at
  10. //
  11. // http://www.apache.org/licenses/LICENSE-2.0
  12. //
  13. // Unless required by applicable law or agreed to in writing, software
  14. // distributed under the License is distributed on an "AS IS" BASIS,
  15. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. // See the License for the specific language governing permissions and
  17. // limitations under the License.
  18. //
  19. if (!defined ('PDF_TYPE_NULL'))
  20. define ('PDF_TYPE_NULL', 0);
  21. if (!defined ('PDF_TYPE_NUMERIC'))
  22. define ('PDF_TYPE_NUMERIC', 1);
  23. if (!defined ('PDF_TYPE_TOKEN'))
  24. define ('PDF_TYPE_TOKEN', 2);
  25. if (!defined ('PDF_TYPE_HEX'))
  26. define ('PDF_TYPE_HEX', 3);
  27. if (!defined ('PDF_TYPE_STRING'))
  28. define ('PDF_TYPE_STRING', 4);
  29. if (!defined ('PDF_TYPE_DICTIONARY'))
  30. define ('PDF_TYPE_DICTIONARY', 5);
  31. if (!defined ('PDF_TYPE_ARRAY'))
  32. define ('PDF_TYPE_ARRAY', 6);
  33. if (!defined ('PDF_TYPE_OBJDEC'))
  34. define ('PDF_TYPE_OBJDEC', 7);
  35. if (!defined ('PDF_TYPE_OBJREF'))
  36. define ('PDF_TYPE_OBJREF', 8);
  37. if (!defined ('PDF_TYPE_OBJECT'))
  38. define ('PDF_TYPE_OBJECT', 9);
  39. if (!defined ('PDF_TYPE_STREAM'))
  40. define ('PDF_TYPE_STREAM', 10);
  41. if (!defined ('PDF_TYPE_BOOLEAN'))
  42. define ('PDF_TYPE_BOOLEAN', 11);
  43. if (!defined ('PDF_TYPE_REAL'))
  44. define ('PDF_TYPE_REAL', 12);
  45. require_once('pdf_context.php');
  46. if (!class_exists('pdf_parser', false)) {
  47. class pdf_parser {
  48. /**
  49. * Filename
  50. * @var string
  51. */
  52. var $filename;
  53. /**
  54. * File resource
  55. * @var resource
  56. */
  57. var $f;
  58. /**
  59. * PDF Context
  60. * @var object pdf_context-Instance
  61. */
  62. var $c;
  63. /**
  64. * xref-Data
  65. * @var array
  66. */
  67. var $xref;
  68. /**
  69. * root-Object
  70. * @var array
  71. */
  72. var $root;
  73. /**
  74. * PDF version of the loaded document
  75. * @var string
  76. */
  77. var $pdfVersion;
  78. /**
  79. * For reading encrypted documents and xref/objectstreams are in use
  80. *
  81. * @var boolean
  82. */
  83. var $readPlain = true;
  84. /**
  85. * Constructor
  86. *
  87. * @param string $filename Source-Filename
  88. */
  89. function pdf_parser($filename) {
  90. $this->filename = $filename;
  91. $this->f = @fopen($this->filename, 'rb');
  92. if (!$this->f)
  93. $this->error(sprintf('Cannot open %s !', $filename));
  94. $this->getPDFVersion();
  95. $this->c = new pdf_context($this->f);
  96. // Read xref-Data
  97. $this->xref = array();
  98. $this->pdf_read_xref($this->xref, $this->pdf_find_xref());
  99. // Check for Encryption
  100. $this->getEncryption();
  101. // Read root
  102. $this->pdf_read_root();
  103. }
  104. /**
  105. * Close the opened file
  106. */
  107. function closeFile() {
  108. if (isset($this->f) && is_resource($this->f)) {
  109. fclose($this->f);
  110. unset($this->f);
  111. }
  112. }
  113. /**
  114. * Print Error and die
  115. *
  116. * @param string $msg Error-Message
  117. */
  118. function error($msg) {
  119. die('<b>PDF-Parser Error:</b> ' . $msg);
  120. }
  121. /**
  122. * Check Trailer for Encryption
  123. */
  124. function getEncryption() {
  125. if (isset($this->xref['trailer'][1]['/Encrypt'])) {
  126. $this->error('File is encrypted!');
  127. }
  128. }
  129. /**
  130. * Find/Return /Root
  131. *
  132. * @return array
  133. */
  134. function pdf_find_root() {
  135. if ($this->xref['trailer'][1]['/Root'][0] != PDF_TYPE_OBJREF) {
  136. $this->error('Wrong Type of Root-Element! Must be an indirect reference');
  137. }
  138. return $this->xref['trailer'][1]['/Root'];
  139. }
  140. /**
  141. * Read the /Root
  142. */
  143. function pdf_read_root() {
  144. // read root
  145. $this->root = $this->pdf_resolve_object($this->c, $this->pdf_find_root());
  146. }
  147. /**
  148. * Get PDF-Version
  149. *
  150. * And reset the PDF Version used in FPDI if needed
  151. */
  152. function getPDFVersion() {
  153. fseek($this->f, 0);
  154. preg_match('/\d\.\d/',fread($this->f, 16), $m);
  155. if (isset($m[0]))
  156. $this->pdfVersion = $m[0];
  157. return $this->pdfVersion;
  158. }
  159. /**
  160. * Find the xref-Table
  161. */
  162. function pdf_find_xref() {
  163. $toRead = 1500;
  164. $stat = fseek ($this->f, -$toRead, SEEK_END);
  165. if ($stat === -1) {
  166. fseek ($this->f, 0);
  167. }
  168. $data = fread($this->f, $toRead);
  169. $pos = strlen($data) - strpos(strrev($data), strrev('startxref'));
  170. $data = substr($data, $pos);
  171. if (!preg_match('/\s*(\d+).*$/s', $data, $matches)) {
  172. $this->error('Unable to find pointer to xref table');
  173. }
  174. return (int) $matches[1];
  175. }
  176. /**
  177. * Read xref-table
  178. *
  179. * @param array $result Array of xref-table
  180. * @param integer $offset of xref-table
  181. */
  182. function pdf_read_xref(&$result, $offset) {
  183. $o_pos = $offset-min(20, $offset);
  184. fseek($this->f, $o_pos); // set some bytes backwards to fetch errorious docs
  185. $data = fread($this->f, 100);
  186. $xrefPos = strrpos($data, 'xref');
  187. if ($xrefPos === false) {
  188. fseek($this->f, $offset);
  189. $c = new pdf_context($this->f);
  190. $xrefStreamObjDec = $this->pdf_read_value($c);
  191. if (is_array($xrefStreamObjDec) && isset($xrefStreamObjDec[0]) && $xrefStreamObjDec[0] == PDF_TYPE_OBJDEC) {
  192. $this->error(sprintf('This document (%s) probably uses a compression technique which is not supported by the free parser shipped with FPDI.', $this->filename));
  193. } else {
  194. $this->error('Unable to find xref table.');
  195. }
  196. }
  197. if (!isset($result['xref_location'])) {
  198. $result['xref_location'] = $o_pos + $xrefPos;
  199. $result['max_object'] = 0;
  200. }
  201. $cylces = -1;
  202. $bytesPerCycle = 100;
  203. fseek($this->f, $o_pos = $o_pos + $xrefPos + 4); // set the handle directly after the "xref"-keyword
  204. $data = fread($this->f, $bytesPerCycle);
  205. while (($trailerPos = strpos($data, 'trailer', max($bytesPerCycle * $cylces++, 0))) === false && !feof($this->f)) {
  206. $data .= fread($this->f, $bytesPerCycle);
  207. }
  208. if ($trailerPos === false) {
  209. $this->error('Trailer keyword not found after xref table');
  210. }
  211. $data = substr($data, 0, $trailerPos);
  212. // get Line-Ending
  213. preg_match_all("/(\r\n|\n|\r)/", substr($data, 0, 100), $m); // check the first 100 bytes for linebreaks
  214. $differentLineEndings = count(array_unique($m[0]));
  215. if ($differentLineEndings > 1) {
  216. $lines = preg_split("/(\r\n|\n|\r)/", $data, -1, PREG_SPLIT_NO_EMPTY);
  217. } else {
  218. $lines = explode($m[0][1], $data);
  219. }
  220. $data = $differentLineEndings = $m = null;
  221. unset($data, $differentLineEndings, $m);
  222. $linesCount = count($lines);
  223. $start = 1;
  224. for ($i = 0; $i < $linesCount; $i++) {
  225. $line = trim($lines[$i]);
  226. if ($line) {
  227. $pieces = explode(' ', $line);
  228. $c = count($pieces);
  229. switch($c) {
  230. case 2:
  231. $start = (int)$pieces[0];
  232. $end = $start + (int)$pieces[1];
  233. if ($end > $result['max_object'])
  234. $result['max_object'] = $end;
  235. break;
  236. case 3:
  237. if (!isset($result['xref'][$start]))
  238. $result['xref'][$start] = array();
  239. if (!array_key_exists($gen = (int) $pieces[1], $result['xref'][$start])) {
  240. $result['xref'][$start][$gen] = $pieces[2] == 'n' ? (int) $pieces[0] : null;
  241. }
  242. $start++;
  243. break;
  244. default:
  245. $this->error('Unexpected data in xref table');
  246. }
  247. }
  248. }
  249. $lines = $pieces = $line = $start = $end = $gen = null;
  250. unset($lines, $pieces, $line, $start, $end, $gen);
  251. fseek($this->f, $o_pos + $trailerPos + 7);
  252. $c = new pdf_context($this->f);
  253. $trailer = $this->pdf_read_value($c);
  254. $c = null;
  255. unset($c);
  256. if (!isset($result['trailer'])) {
  257. $result['trailer'] = $trailer;
  258. }
  259. if (isset($trailer[1]['/Prev'])) {
  260. $this->pdf_read_xref($result, $trailer[1]['/Prev'][1]);
  261. }
  262. $trailer = null;
  263. unset($trailer);
  264. return true;
  265. }
  266. /**
  267. * Reads an Value
  268. *
  269. * @param object $c pdf_context
  270. * @param string $token a Token
  271. * @return mixed
  272. */
  273. function pdf_read_value(&$c, $token = null) {
  274. if (is_null($token)) {
  275. $token = $this->pdf_read_token($c);
  276. }
  277. if ($token === false) {
  278. return false;
  279. }
  280. switch ($token) {
  281. case '<':
  282. // This is a hex string.
  283. // Read the value, then the terminator
  284. $pos = $c->offset;
  285. while(1) {
  286. $match = strpos ($c->buffer, '>', $pos);
  287. // If you can't find it, try
  288. // reading more data from the stream
  289. if ($match === false) {
  290. if (!$c->increase_length()) {
  291. return false;
  292. } else {
  293. continue;
  294. }
  295. }
  296. $result = substr ($c->buffer, $c->offset, $match - $c->offset);
  297. $c->offset = $match + 1;
  298. return array (PDF_TYPE_HEX, $result);
  299. }
  300. break;
  301. case '<<':
  302. // This is a dictionary.
  303. $result = array();
  304. // Recurse into this function until we reach
  305. // the end of the dictionary.
  306. while (($key = $this->pdf_read_token($c)) !== '>>') {
  307. if ($key === false) {
  308. return false;
  309. }
  310. if (($value = $this->pdf_read_value($c)) === false) {
  311. return false;
  312. }
  313. // Catch missing value
  314. if ($value[0] == PDF_TYPE_TOKEN && $value[1] == '>>') {
  315. $result[$key] = array(PDF_TYPE_NULL);
  316. break;
  317. }
  318. $result[$key] = $value;
  319. }
  320. return array (PDF_TYPE_DICTIONARY, $result);
  321. case '[':
  322. // This is an array.
  323. $result = array();
  324. // Recurse into this function until we reach
  325. // the end of the array.
  326. while (($token = $this->pdf_read_token($c)) !== ']') {
  327. if ($token === false) {
  328. return false;
  329. }
  330. if (($value = $this->pdf_read_value($c, $token)) === false) {
  331. return false;
  332. }
  333. $result[] = $value;
  334. }
  335. return array (PDF_TYPE_ARRAY, $result);
  336. case '(' :
  337. // This is a string
  338. $pos = $c->offset;
  339. $openBrackets = 1;
  340. do {
  341. for (; $openBrackets != 0 && $pos < $c->length; $pos++) {
  342. switch (ord($c->buffer[$pos])) {
  343. case 0x28: // '('
  344. $openBrackets++;
  345. break;
  346. case 0x29: // ')'
  347. $openBrackets--;
  348. break;
  349. case 0x5C: // backslash
  350. $pos++;
  351. }
  352. }
  353. } while($openBrackets != 0 && $c->increase_length());
  354. $result = substr($c->buffer, $c->offset, $pos - $c->offset - 1);
  355. $c->offset = $pos;
  356. return array (PDF_TYPE_STRING, $result);
  357. case 'stream':
  358. $o_pos = ftell($c->file)-strlen($c->buffer);
  359. $o_offset = $c->offset;
  360. $c->reset($startpos = $o_pos + $o_offset);
  361. $e = 0; // ensure line breaks in front of the stream
  362. if ($c->buffer[0] == chr(10) || $c->buffer[0] == chr(13))
  363. $e++;
  364. if ($c->buffer[1] == chr(10) && $c->buffer[0] != chr(10))
  365. $e++;
  366. if ($this->actual_obj[1][1]['/Length'][0] == PDF_TYPE_OBJREF) {
  367. $tmp_c = new pdf_context($this->f);
  368. $tmp_length = $this->pdf_resolve_object($tmp_c, $this->actual_obj[1][1]['/Length']);
  369. $length = $tmp_length[1][1];
  370. } else {
  371. $length = $this->actual_obj[1][1]['/Length'][1];
  372. }
  373. if ($length > 0) {
  374. $c->reset($startpos + $e,$length);
  375. $v = $c->buffer;
  376. } else {
  377. $v = '';
  378. }
  379. $c->reset($startpos + $e + $length + 9); // 9 = strlen("endstream")
  380. return array(PDF_TYPE_STREAM, $v);
  381. default :
  382. if (is_numeric ($token)) {
  383. // A numeric token. Make sure that
  384. // it is not part of something else.
  385. if (($tok2 = $this->pdf_read_token ($c)) !== false) {
  386. if (is_numeric ($tok2)) {
  387. // Two numeric tokens in a row.
  388. // In this case, we're probably in
  389. // front of either an object reference
  390. // or an object specification.
  391. // Determine the case and return the data
  392. if (($tok3 = $this->pdf_read_token ($c)) !== false) {
  393. switch ($tok3) {
  394. case 'obj':
  395. return array (PDF_TYPE_OBJDEC, (int) $token, (int) $tok2);
  396. case 'R':
  397. return array (PDF_TYPE_OBJREF, (int) $token, (int) $tok2);
  398. }
  399. // If we get to this point, that numeric value up
  400. // there was just a numeric value. Push the extra
  401. // tokens back into the stack and return the value.
  402. array_push ($c->stack, $tok3);
  403. }
  404. }
  405. array_push ($c->stack, $tok2);
  406. }
  407. if ($token === (string)((int)$token))
  408. return array (PDF_TYPE_NUMERIC, (int)$token);
  409. else
  410. return array (PDF_TYPE_REAL, (float)$token);
  411. } else if ($token == 'true' || $token == 'false') {
  412. return array (PDF_TYPE_BOOLEAN, $token == 'true');
  413. } else if ($token == 'null') {
  414. return array (PDF_TYPE_NULL);
  415. } else {
  416. // Just a token. Return it.
  417. return array (PDF_TYPE_TOKEN, $token);
  418. }
  419. }
  420. }
  421. /**
  422. * Resolve an object
  423. *
  424. * @param object $c pdf_context
  425. * @param array $obj_spec The object-data
  426. * @param boolean $encapsulate Must set to true, cause the parsing and fpdi use this method only without this para
  427. */
  428. function pdf_resolve_object(&$c, $obj_spec, $encapsulate = true) {
  429. // Exit if we get invalid data
  430. if (!is_array($obj_spec)) {
  431. $ret = false;
  432. return $ret;
  433. }
  434. if ($obj_spec[0] == PDF_TYPE_OBJREF) {
  435. // This is a reference, resolve it
  436. if (isset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]])) {
  437. // Save current file position
  438. // This is needed if you want to resolve
  439. // references while you're reading another object
  440. // (e.g.: if you need to determine the length
  441. // of a stream)
  442. $old_pos = ftell($c->file);
  443. // Reposition the file pointer and
  444. // load the object header.
  445. $c->reset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]]);
  446. $header = $this->pdf_read_value($c);
  447. if ($header[0] != PDF_TYPE_OBJDEC || $header[1] != $obj_spec[1] || $header[2] != $obj_spec[2]) {
  448. $toSearchFor = $obj_spec[1] . ' ' . $obj_spec[2] . ' obj';
  449. if (preg_match('/' . $toSearchFor . '/', $c->buffer)) {
  450. $c->offset = strpos($c->buffer, $toSearchFor) + strlen($toSearchFor);
  451. // reset stack
  452. $c->stack = array();
  453. } else {
  454. $this->error("Unable to find object ({$obj_spec[1]}, {$obj_spec[2]}) at expected location");
  455. }
  456. }
  457. // If we're being asked to store all the information
  458. // about the object, we add the object ID and generation
  459. // number for later use
  460. $result = array();
  461. $this->actual_obj =& $result;
  462. if ($encapsulate) {
  463. $result = array (
  464. PDF_TYPE_OBJECT,
  465. 'obj' => $obj_spec[1],
  466. 'gen' => $obj_spec[2]
  467. );
  468. }
  469. // Now simply read the object data until
  470. // we encounter an end-of-object marker
  471. while(1) {
  472. $value = $this->pdf_read_value($c);
  473. if ($value === false || count($result) > 4) {
  474. // in this case the parser coudn't find an endobj so we break here
  475. break;
  476. }
  477. if ($value[0] == PDF_TYPE_TOKEN && $value[1] === 'endobj') {
  478. break;
  479. }
  480. $result[] = $value;
  481. }
  482. $c->reset($old_pos);
  483. if (isset($result[2][0]) && $result[2][0] == PDF_TYPE_STREAM) {
  484. $result[0] = PDF_TYPE_STREAM;
  485. }
  486. return $result;
  487. }
  488. } else {
  489. return $obj_spec;
  490. }
  491. }
  492. /**
  493. * Reads a token from the file
  494. *
  495. * @param object $c pdf_context
  496. * @return mixed
  497. */
  498. function pdf_read_token(&$c)
  499. {
  500. // If there is a token available
  501. // on the stack, pop it out and
  502. // return it.
  503. if (count($c->stack)) {
  504. return array_pop($c->stack);
  505. }
  506. // Strip away any whitespace
  507. do {
  508. if (!$c->ensure_content()) {
  509. return false;
  510. }
  511. $c->offset += strspn($c->buffer, " \n\r\t", $c->offset);
  512. } while ($c->offset >= $c->length - 1);
  513. // Get the first character in the stream
  514. $char = $c->buffer[$c->offset++];
  515. switch ($char) {
  516. case '[':
  517. case ']':
  518. case '(':
  519. case ')':
  520. // This is either an array or literal string
  521. // delimiter, Return it
  522. return $char;
  523. case '<':
  524. case '>':
  525. // This could either be a hex string or
  526. // dictionary delimiter. Determine the
  527. // appropriate case and return the token
  528. if ($c->buffer[$c->offset] == $char) {
  529. if (!$c->ensure_content()) {
  530. return false;
  531. }
  532. $c->offset++;
  533. return $char . $char;
  534. } else {
  535. return $char;
  536. }
  537. case '%':
  538. // This is a comment - jump over it!
  539. $pos = $c->offset;
  540. while(1) {
  541. $match = preg_match("/(\r\n|\r|\n)/", $c->buffer, $m, PREG_OFFSET_CAPTURE, $pos);
  542. if ($match === 0) {
  543. if (!$c->increase_length()) {
  544. return false;
  545. } else {
  546. continue;
  547. }
  548. }
  549. $c->offset = $m[0][1]+strlen($m[0][0]);
  550. return $this->pdf_read_token($c);
  551. }
  552. default:
  553. // This is "another" type of token (probably
  554. // a dictionary entry or a numeric value)
  555. // Find the end and return it.
  556. if (!$c->ensure_content()) {
  557. return false;
  558. }
  559. while(1) {
  560. // Determine the length of the token
  561. $pos = strcspn($c->buffer, " %[]<>()\r\n\t/", $c->offset);
  562. if ($c->offset + $pos <= $c->length - 1) {
  563. break;
  564. } else {
  565. // If the script reaches this point,
  566. // the token may span beyond the end
  567. // of the current buffer. Therefore,
  568. // we increase the size of the buffer
  569. // and try again--just to be safe.
  570. $c->increase_length();
  571. }
  572. }
  573. $result = substr($c->buffer, $c->offset - 1, $pos + 1);
  574. $c->offset += $pos;
  575. return $result;
  576. }
  577. }
  578. }
  579. }