PageRenderTime 51ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/tcpdf/tcpdf_parser.php

https://gitlab.com/staging06/myproject
PHP | 511 lines | 340 code | 18 blank | 153 comment | 57 complexity | 91f0d4ad0fed09f2b2ceb5b1024ef289 MD5 | raw file
  1. <?php
  2. //============================================================+
  3. // File name : tcpdf_parser.php
  4. // Version : 1.0.001
  5. // Begin : 2011-05-23
  6. // Last Update : 2012-05-03
  7. // Author : Nicola Asuni - Tecnick.com LTD - Manor Coach House, Church Hill, Aldershot, Hants, GU12 4RQ, UK - www.tecnick.com - info@tecnick.com
  8. // License : http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT GNU-LGPLv3
  9. // -------------------------------------------------------------------
  10. // Copyright (C) 2011-2012 Nicola Asuni - Tecnick.com LTD
  11. //
  12. // This file is part of TCPDF software library.
  13. //
  14. // TCPDF is free software: you can redistribute it and/or modify it
  15. // under the terms of the GNU Lesser General Public License as
  16. // published by the Free Software Foundation, either version 3 of the
  17. // License, or (at your option) any later version.
  18. //
  19. // TCPDF is distributed in the hope that it will be useful, but
  20. // WITHOUT ANY WARRANTY; without even the implied warranty of
  21. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  22. // See the GNU Lesser General Public License for more details.
  23. //
  24. // You should have received a copy of the License
  25. // along with TCPDF. If not, see
  26. // <http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT>.
  27. //
  28. // See LICENSE.TXT file for more information.
  29. // -------------------------------------------------------------------
  30. //
  31. // Description : This is a PHP class for parsing PDF documents.
  32. //
  33. //============================================================+
  34. /**
  35. * @file
  36. * This is a PHP class for parsing PDF documents.<br>
  37. * @package com.tecnick.tcpdf
  38. * @author Nicola Asuni
  39. * @version 1.0.001
  40. */
  41. // include class for decoding filters
  42. require_once(dirname(__FILE__).'/tcpdf_filters.php');
  /**
   * @class TCPDF_PARSER
   * Parser for PDF documents that use a classic cross-reference table.<br>
   * The constructor locates the cross-reference table(s) and trailer, then parses
   * every in-use indirect object into a nested array structure. The result is
   * retrieved with getParsedData().
   *
   * Every parsed element is an array: (0 => type, 1 => value, 2 => offset of the
   * next token). Stream elements parsed with decoding enabled carry an extra
   * index 3 holding the return value of decodeStream().
   *
   * NOTE: the type strings 'ojbref' (indirect reference) and 'ojb' (object start)
   * are historical misspellings. They are kept unchanged because they are part of
   * the data returned to callers.
   *
   * @package com.tecnick.tcpdf
   * @brief This is a PHP class for parsing PDF documents.
   * @version 1.0.001
   * @author Nicola Asuni - info@tecnick.com
   */
  class TCPDF_PARSER {

      /**
       * Raw content of the PDF document.
       * Only populated while the constructor runs; reset to an empty string afterwards.
       * @private
       */
      private $pdfdata = '';

      /**
       * XREF data: 'xref' maps "[object number]_[generation number]" to a byte offset,
       * 'trailer' holds the parsed trailer entries (size, root, encrypt, info, id).
       * @protected
       */
      protected $xref = array();

      /**
       * Array of parsed PDF objects, indexed by "[object number]_[generation number]".
       * @protected
       */
      protected $objects = array();

      /**
       * Class object for decoding filters.
       * @private
       */
      private $FilterDecoders;

      // -----------------------------------------------------------------------------

      /**
       * Parse a PDF document and store the resulting array of objects.
       * @param $data (string) PDF data to parse.
       * @public
       * @since 1.0.000 (2011-05-24)
       */
      public function __construct($data) {
          if (empty($data)) {
              $this->Error('Empty PDF data.');
          }
          $this->pdfdata = $data;
          // initialize class for decoding filters
          $this->FilterDecoders = new TCPDF_FILTERS();
          // get xref and trailer data
          $this->xref = $this->getXrefData();
          // parse all document objects
          $this->objects = array();
          // the 'xref' key is missing when the table lists no in-use object
          if (isset($this->xref['xref']) && is_array($this->xref['xref'])) {
              foreach ($this->xref['xref'] as $obj => $offset) {
                  // an object may already be present if it was resolved while decoding a stream
                  if (!isset($this->objects[$obj])) {
                      $this->objects[$obj] = $this->getIndirectObject($obj, $offset, true);
                  }
              }
          }
          // release some memory
          $this->pdfdata = '';
      }

      /**
       * Return an array of parsed PDF document objects.
       * @return (array) Two-element array: (0 => xref and trailer data, 1 => parsed objects).
       * @public
       * @since 1.0.000 (2011-06-26)
       */
      public function getParsedData() {
          return array($this->xref, $this->objects);
      }

      /**
       * Get xref (cross-reference table) and trailer data from PDF document data.
       * Follows the trailer /Prev chain recursively; entries and trailer values found
       * first (i.e. in the most recent section) take precedence.
       * @param $offset (int) xref offset (if known); 0 means "use the last startxref in the file".
       * @param $xref (array) previous xref array (if any).
       * @param $visited (array) Internal: xref offsets already processed, used to stop /Prev loops.
       * @return Array containing xref and trailer data.
       * @protected
       * @since 1.0.000 (2011-05-24)
       */
      protected function getXrefData($offset=0, $xref=array(), $visited=array()) {
          $startxref_pattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
          $offset = intval($offset);
          $pdflen = strlen($this->pdfdata);
          if ($offset == 0) {
              // find last startxref
              if (preg_match_all($startxref_pattern, $this->pdfdata, $matches, PREG_SET_ORDER) == 0) {
                  $this->Error('Unable to find startxref');
                  return $xref;
              }
              $last = array_pop($matches);
              $startxref = intval($last[1]);
          } elseif (($offset < $pdflen) && (substr($this->pdfdata, $offset, 4) === 'xref')) {
              // the given offset already points at a cross-reference table
              $startxref = $offset;
          } else {
              // get the first startxref after the specified offset
              if (($offset < 0) || ($offset >= $pdflen)
                  || (preg_match($startxref_pattern, $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) == 0)) {
                  $this->Error('Unable to find startxref');
                  return $xref;
              }
              $startxref = intval($matches[1][0]);
          }
          // check xref position (strict: an out-of-range or wrong offset must not pass)
          if (($startxref >= $pdflen) || (substr($this->pdfdata, $startxref, 4) !== 'xref')) {
              $this->Error('Unable to find xref');
              return $xref;
          }
          // stop if this section was already processed (malformed /Prev chain)
          if (isset($visited[$startxref])) {
              return $xref;
          }
          $visited[$startxref] = true;
          // extract xref data (object indexes and offsets); skip the "xref" keyword and one EOL byte
          $xoffset = $startxref + 5;
          // entries of this section end where its trailer begins
          $trailer_pos = strpos($this->pdfdata, 'trailer', $startxref);
          // initialize object number
          $obj_num = 0;
          $offset = $xoffset;
          while (preg_match('/^([0-9]+)[\s]([0-9]+)[\s]?([nf]?)/im', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) {
              if (($trailer_pos !== false) && ($matches[0][1] > $trailer_pos)) {
                  // the match belongs to data located after this section
                  break;
              }
              $offset = (strlen($matches[0][0]) + $matches[0][1]);
              $entry_type = $matches[3][0];
              if ($entry_type === 'n') {
                  // in-use entry: create unique object index [object number]_[generation number]
                  $index = $obj_num.'_'.intval($matches[2][0]);
                  // keep the offset from the most recent section only
                  if (!isset($xref['xref'][$index])) {
                      $xref['xref'][$index] = intval($matches[1][0]);
                  }
                  ++$obj_num;
                  // skip the end-of-line marker of the entry
                  $offset += 2;
              } elseif ($entry_type === 'f') {
                  // free entry: only advances the object number
                  ++$obj_num;
                  $offset += 2;
              } else {
                  // subsection header: first number is the starting object number
                  $obj_num = intval($matches[1][0]);
              }
          }
          // get trailer data
          if (preg_match('/trailer[\s]*<<(.*)>>[\s]*[\r\n]+startxref[\s]*[\r\n]+/isU', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $xoffset) > 0) {
              $trailer_data = $matches[1][0];
              if (!isset($xref['trailer'])) {
                  // get only the last updated version
                  $xref['trailer'] = array();
                  // parse trailer_data
                  if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['size'] = intval($matches[1]);
                  }
                  if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['root'] = intval($matches[1]).'_'.intval($matches[2]);
                  }
                  if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['encrypt'] = intval($matches[1]).'_'.intval($matches[2]);
                  }
                  if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['info'] = intval($matches[1]).'_'.intval($matches[2]);
                  }
                  if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['id'] = array();
                      $xref['trailer']['id'][0] = $matches[1];
                      $xref['trailer']['id'][1] = $matches[2];
                  }
              }
              if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                  // get previous xref
                  $xref = $this->getXrefData(intval($matches[1]), $xref, $visited);
              }
          } else {
              $this->Error('Unable to find trailer');
          }
          return $xref;
      }

      /**
       * Get object type, raw value and offset to next object.
       * Returns an empty type ('') when the end of data is reached or the token at
       * $offset is not recognised; in that case the offset is not advanced past it.
       * @param $offset (int) Object offset.
       * @return array containing object type, raw value and offset to next object
       * @protected
       * @since 1.0.000 (2011-06-20)
       */
      protected function getRawObject($offset=0) {
          $objtype = ''; // object type to be returned
          $objval = ''; // object value to be returned
          $pdflen = strlen($this->pdfdata);
          if ($offset >= $pdflen) {
              // nothing left to read
              return array($objtype, $objval, $offset);
          }
          // skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP)
          $offset += strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset);
          if ($offset >= $pdflen) {
              return array($objtype, $objval, $offset);
          }
          // get first char
          $char = $this->pdfdata[$offset];
          // get object type
          switch ($char) {
              case '%': { // \x25 PERCENT SIGN
                  // skip comment (up to the end of line) and return the next token
                  $next = strcspn($this->pdfdata, "\r\n", $offset);
                  if ($next > 0) {
                      $offset += $next;
                      return $this->getRawObject($offset);
                  }
                  break;
              }
              case '/': { // \x2F SOLIDUS
                  // name object
                  $objtype = $char;
                  ++$offset;
                  if (preg_match('/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/', substr($this->pdfdata, $offset, 256), $matches) == 1) {
                      $objval = $matches[1]; // unescaped value
                      $offset += strlen($objval);
                  }
                  break;
              }
              case '(': // \x28 LEFT PARENTHESIS
              case ')': { // \x29 RIGHT PARENTHESIS
                  // literal string object
                  $objtype = $char;
                  ++$offset;
                  if ($char == '(') {
                      $strpos = $offset;
                      $depth = 1; // number of currently open parentheses
                      while ($depth > 0) {
                          if (!isset($this->pdfdata[$strpos])) {
                              // unterminated string: stop at end of data
                              break;
                          }
                          switch ($this->pdfdata[$strpos]) {
                              case '\\': { // REVERSE SOLIDUS (5Ch) (Backslash)
                                  // skip next (escaped) character
                                  ++$strpos;
                                  break;
                              }
                              case '(': { // LEFT PARENTHESIS (28h)
                                  ++$depth;
                                  break;
                              }
                              case ')': { // RIGHT PARENTHESIS (29h)
                                  --$depth;
                                  break;
                              }
                          }
                          ++$strpos;
                      }
                      // exclude the closing parenthesis only when one was actually consumed
                      $length = $strpos - $offset - (($depth == 0) ? 1 : 0);
                      $objval = (string) substr($this->pdfdata, $offset, max(0, $length));
                      $offset = $strpos;
                  }
                  break;
              }
              case '[': // \x5B LEFT SQUARE BRACKET
              case ']': { // \x5D RIGHT SQUARE BRACKET
                  // array object
                  $objtype = $char;
                  ++$offset;
                  if ($char == '[') {
                      // get array content
                      $objval = array();
                      do {
                          // get element
                          $element = $this->getRawObject($offset);
                          $offset = $element[2];
                          $objval[] = $element;
                          // an empty type means no further progress is possible (truncated/invalid data)
                      } while (($element[0] !== ']') && ($element[0] !== ''));
                      // remove closing delimiter
                      array_pop($objval);
                  }
                  break;
              }
              case '<': // \x3C LESS-THAN SIGN
              case '>': { // \x3E GREATER-THAN SIGN
                  if (isset($this->pdfdata[($offset + 1)]) && ($this->pdfdata[($offset + 1)] == $char)) {
                      // dictionary object
                      $objtype = $char.$char;
                      $offset += 2;
                      if ($char == '<') {
                          // get dictionary content as a flat list of key/value elements
                          $objval = array();
                          do {
                              // get element
                              $element = $this->getRawObject($offset);
                              $offset = $element[2];
                              $objval[] = $element;
                              // an empty type means no further progress is possible (truncated/invalid data)
                          } while (($element[0] !== '>>') && ($element[0] !== ''));
                          // remove closing delimiter
                          array_pop($objval);
                      }
                  } else {
                      // hexadecimal string object
                      $objtype = $char;
                      ++$offset;
                      // \G anchors the match at $offset without copying the rest of the document
                      if (($char == '<') && (preg_match('/\G([0-9A-Fa-f]+)[>]/iU', $this->pdfdata, $matches, 0, $offset) == 1)) {
                          $objval = $matches[1];
                          $offset += strlen($matches[0]);
                      }
                  }
                  break;
              }
              default: {
                  if (substr($this->pdfdata, $offset, 6) === 'endobj') {
                      // end of indirect object
                      $objtype = 'endobj';
                      $offset += 6;
                  } elseif (substr($this->pdfdata, $offset, 4) === 'null') {
                      // null object
                      $objtype = 'null';
                      $offset += 4;
                      $objval = 'null';
                  } elseif (substr($this->pdfdata, $offset, 4) === 'true') {
                      // boolean true object
                      $objtype = 'boolean';
                      $offset += 4;
                      $objval = 'true';
                  } elseif (substr($this->pdfdata, $offset, 5) === 'false') {
                      // boolean false object
                      $objtype = 'boolean';
                      $offset += 5;
                      $objval = 'false';
                  } elseif (substr($this->pdfdata, $offset, 6) === 'stream') {
                      // start stream object
                      $objtype = 'stream';
                      $offset += 6;
                      // With the U (ungreedy) modifier the leading [\r\n]+ consumes a single
                      // byte, so for a CRLF line ending the captured value starts with "\n".
                      if (preg_match('/\G[\r\n]+(.*)[\r\n]*endstream/isU', $this->pdfdata, $matches, 0, $offset) == 1) {
                          $objval = $matches[1];
                          $offset += strlen($matches[0]);
                      }
                  } elseif (substr($this->pdfdata, $offset, 9) === 'endstream') {
                      // end stream object
                      $objtype = 'endstream';
                      $offset += 9;
                  } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) {
                      // indirect object reference (type string kept as historically spelled)
                      $objtype = 'ojbref';
                      $offset += strlen($matches[0]);
                      $objval = intval($matches[1]).'_'.intval($matches[2]);
                  } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) {
                      // object start (type string kept as historically spelled)
                      $objtype = 'ojb';
                      $objval = intval($matches[1]).'_'.intval($matches[2]);
                      $offset += strlen($matches[0]);
                  } elseif (($numlen = strspn($this->pdfdata, '+-.0123456789', $offset)) > 0) {
                      // numeric object
                      $objtype = 'numeric';
                      $objval = substr($this->pdfdata, $offset, $numlen);
                      $offset += $numlen;
                  }
                  break;
              }
          }
          return array($objtype, $objval, $offset);
      }

      /**
       * Get content of indirect object.
       * @param $obj_ref (string) Object number and generation number separated by underscore character.
       * @param $offset (int) Object offset.
       * @param $decoding (boolean) If true decode streams.
       * @return array containing object data: a list of elements, or array('null', 'null', $offset)
       *         when no matching "N G obj" header is found at $offset.
       * @protected
       * @since 1.0.000 (2011-05-24)
       */
      protected function getIndirectObject($obj_ref, $offset=0, $decoding=true) {
          $obj = explode('_', $obj_ref);
          if (count($obj) != 2) {
              $this->Error('Invalid object reference: '.$obj_ref);
              return;
          }
          $objref = $obj[0].' '.$obj[1].' obj';
          // substr() is used instead of strpos() so that an out-of-range offset or a
          // "not found" result can never be mistaken for a match at position 0
          if (($offset < 0) || (substr($this->pdfdata, $offset, strlen($objref)) !== $objref)) {
              // an indirect reference to an undefined object shall be considered a reference to the null object
              return array('null', 'null', $offset);
          }
          // starting position of object content
          $offset += strlen($objref);
          // get array of object content
          $objdata = array();
          $i = 0; // object main index
          do {
              // get element
              $element = $this->getRawObject($offset);
              $offset = $element[2];
              // decode stream using stream's dictionary information
              if ($decoding && ($element[0] === 'stream') && isset($objdata[($i - 1)][0]) && ($objdata[($i - 1)][0] === '<<')) {
                  $stream = (string) $element[1];
                  // getRawObject() consumes only the first EOL byte after "stream":
                  // drop the LF that is left over when the line ending was CRLF,
                  // but never drop a data byte when the line ending was a single LF
                  if (isset($stream[0]) && ($stream[0] === "\n")) {
                      $stream = (string) substr($stream, 1);
                  }
                  $element[3] = $this->decodeStream($objdata[($i - 1)][1], $stream);
              }
              $objdata[$i] = $element;
              ++$i;
              // an empty type means no further progress is possible (truncated/invalid data)
          } while (($element[0] !== 'endobj') && ($element[0] !== ''));
          // remove closing delimiter
          array_pop($objdata);
          // return raw object content
          return $objdata;
      }

      /**
       * Get the content of object, resolving indirect object reference if necessary.
       * @param $obj (array) Object element (type, value, offset).
       * @return array containing object data: the parsed indirect object (list of elements)
       *         when $obj is a resolvable reference, otherwise $obj unchanged.
       * @protected
       * @since 1.0.000 (2011-06-26)
       */
      protected function getObjectVal($obj) {
          // getRawObject() emits 'ojbref'; 'objref' is accepted as well
          if (isset($obj[0], $obj[1]) && (($obj[0] === 'ojbref') || ($obj[0] === 'objref'))) {
              $ref = $obj[1];
              if (isset($this->objects[$ref])) {
                  // this object has been already parsed
                  return $this->objects[$ref];
              }
              // object offsets are stored under the 'xref' key of the xref data
              if (isset($this->xref['xref'][$ref])) {
                  // parse new object
                  $this->objects[$ref] = $this->getIndirectObject($ref, $this->xref['xref'][$ref], false);
                  return $this->objects[$ref];
              }
          }
          return $obj;
      }

      /**
       * Decode the specified stream.
       * @param $sdic (array) Stream's dictionary array.
       * @param $stream (string) Stream to decode.
       * @return array containing decoded stream data and remaining filters.
       * @protected
       * @since 1.0.000 (2011-06-22)
       */
      protected function decodeStream($sdic, $stream) {
          // get stream length and filters
          $slength = strlen($stream);
          $filters = array();
          foreach ($sdic as $k => $v) {
              if (($v[0] != '/') || !isset($sdic[($k + 1)])) {
                  // not a name, or no value follows it
                  continue;
              }
              $next = $sdic[($k + 1)];
              if (($v[1] == 'Length') && ($next[0] == 'numeric')) {
                  // get declared stream length and drop any trailing bytes (e.g. the EOL before "endstream")
                  $declength = intval($next[1]);
                  if (($declength >= 0) && ($declength < $slength)) {
                      $stream = substr($stream, 0, $declength);
                      $slength = $declength;
                  }
              } elseif ($v[1] == 'Filter') {
                  // resolve indirect object
                  $objval = $this->getObjectVal($next);
                  if (isset($objval[0]) && is_array($objval[0])) {
                      // a resolved indirect object is a list of elements: the filter is the first one
                      $objval = $objval[0];
                  }
                  if (!isset($objval[0])) {
                      continue;
                  }
                  if ($objval[0] == '/') {
                      // single filter
                      $filters[] = $objval[1];
                  } elseif (($objval[0] == '[') && is_array($objval[1])) {
                      // array of filters
                      foreach ($objval[1] as $flt) {
                          if ($flt[0] == '/') {
                              $filters[] = $flt[1];
                          }
                      }
                  }
              }
          }
          // decode the stream
          $remaining_filters = array();
          if (!empty($filters)) {
              // NOTE(review): assumes the list of available filters does not change between calls
              $available = $this->FilterDecoders->getAvailableFilters();
              foreach ($filters as $filter) {
                  if (in_array($filter, $available)) {
                      $stream = $this->FilterDecoders->decodeFilter($filter, $stream);
                  } else {
                      // add missing filter to array
                      $remaining_filters[] = $filter;
                  }
              }
          }
          return array($stream, $remaining_filters);
      }

      /**
       * This method is called in case of fatal error; it simply outputs the message and halts the execution.
       * @param $msg (string) The error message
       * @public
       * @since 1.0.000 (2011-05-23)
       */
      public function Error($msg) {
          // exit program and print error
          die('<strong>TCPDF_PARSER ERROR: </strong>'.$msg);
      }

  } // END OF TCPDF_PARSER CLASS
  490. //============================================================+
  491. // END OF FILE
  492. //============================================================+