PageRenderTime 56ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/smalot/pdfparser/src/Smalot/PdfParser/Parser.php

https://gitlab.com/lauroPereira/aula_android_webservice
PHP | 313 lines | 179 code | 56 blank | 78 comment | 14 complexity | 377efc8fb54f5486df691c8bd9c3f0b0 MD5 | raw file
  1. <?php
  2. /**
  3. * @file
  4. * This file is part of the PdfParser library.
  5. *
  6. * @author Sébastien MALOT <sebastien@malot.fr>
  7. * @date 2013-08-08
  8. * @license GPL-3.0
  9. * @url <https://github.com/smalot/pdfparser>
  10. *
  11. * PdfParser is a pdf library written in PHP, extraction oriented.
  12. * Copyright (C) 2014 - Sébastien MALOT <sebastien@malot.fr>
  13. *
  14. * This program is free software: you can redistribute it and/or modify
  15. * it under the terms of the GNU General Public License as published by
  16. * the Free Software Foundation, either version 3 of the License, or
  17. * (at your option) any later version.
  18. *
  19. * This program is distributed in the hope that it will be useful,
  20. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  22. * GNU General Public License for more details.
  23. *
  24. * You should have received a copy of the GNU General Public License
  25. * along with this program.
  26. * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
  27. *
  28. */
  29. namespace Smalot\PdfParser;
  30. use Smalot\PdfParser\Element\ElementArray;
  31. use Smalot\PdfParser\Element\ElementBoolean;
  32. use Smalot\PdfParser\Element\ElementDate;
  33. use Smalot\PdfParser\Element\ElementHexa;
  34. use Smalot\PdfParser\Element\ElementName;
  35. use Smalot\PdfParser\Element\ElementNull;
  36. use Smalot\PdfParser\Element\ElementNumeric;
  37. use Smalot\PdfParser\Element\ElementString;
  38. use Smalot\PdfParser\Element\ElementXRef;
  39. /**
  40. * Class Parser
  41. *
  42. * @package Smalot\PdfParser
  43. */
  44. class Parser
  45. {
  46. /**
  47. * @var Object[]
  48. */
  49. protected $objects = array();
  50. /**
  51. *
  52. */
  53. public function __construct()
  54. {
  55. }
  56. /**
  57. * Parse PDF file
  58. *
  59. * @param string $filename
  60. *
  61. * @return Document
  62. */
  63. public function parseFile($filename)
  64. {
  65. $content = file_get_contents($filename);
  66. return @$this->parseContent($content);
  67. }
  68. /**
  69. * Parse PDF content
  70. *
  71. * @param string $content
  72. *
  73. * @return Document
  74. */
  75. public function parseContent($content)
  76. {
  77. // Create structure using TCPDF Parser.
  78. ob_start();
  79. @$parser = new \TCPDF_PARSER(ltrim($content));
  80. list($xref, $data) = $parser->getParsedData();
  81. unset($parser);
  82. ob_end_clean();
  83. if (isset($xref['trailer']['encrypt'])) {
  84. throw new \Exception('Secured pdf file are currently not supported.');
  85. }
  86. if (empty($data)) {
  87. throw new \Exception('Object list not found. Possible secured file.');
  88. }
  89. // Create destination object.
  90. $document = new Document();
  91. $this->objects = array();
  92. foreach ($data as $id => $structure) {
  93. $this->parseObject($id, $structure, $document);
  94. unset($data[$id]);
  95. }
  96. $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
  97. $document->setObjects($this->objects);
  98. return $document;
  99. }
  100. protected function parseTrailer($structure, $document)
  101. {
  102. $trailer = array();
  103. foreach ($structure as $name => $values) {
  104. $name = ucfirst($name);
  105. if (is_numeric($values)) {
  106. $trailer[$name] = new ElementNumeric($values, $document);
  107. } elseif (is_array($values)) {
  108. $value = $this->parseTrailer($values, null);
  109. $trailer[$name] = new ElementArray($value, null);
  110. } elseif (strpos($values, '_') !== false) {
  111. $trailer[$name] = new ElementXRef($values, $document);
  112. } else {
  113. $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
  114. }
  115. }
  116. return new Header($trailer, $document);
  117. }
  118. /**
  119. * @param string $id
  120. * @param array $structure
  121. * @param Document $document
  122. */
  123. protected function parseObject($id, $structure, $document)
  124. {
  125. $header = new Header(array(), $document);
  126. $content = '';
  127. foreach ($structure as $position => $part) {
  128. switch ($part[0]) {
  129. case '[':
  130. $elements = array();
  131. foreach ($part[1] as $sub_element) {
  132. $sub_type = $sub_element[0];
  133. $sub_value = $sub_element[1];
  134. $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
  135. }
  136. $header = new Header($elements, $document);
  137. break;
  138. case '<<':
  139. $header = $this->parseHeader($part[1], $document);
  140. break;
  141. case 'stream':
  142. $content = isset($part[3][0]) ? $part[3][0] : $part[1];
  143. if ($header->get('Type')->equals('ObjStm')) {
  144. $match = array();
  145. // Split xrefs and contents.
  146. preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
  147. $content = $match[3];
  148. // Extract xrefs.
  149. $xrefs = preg_split(
  150. '/(\d+\s+\d+\s*)/s',
  151. $match[1],
  152. -1,
  153. PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
  154. );
  155. $table = array();
  156. foreach ($xrefs as $xref) {
  157. list($id, $position) = explode(' ', trim($xref));
  158. $table[$position] = $id;
  159. }
  160. ksort($table);
  161. $ids = array_values($table);
  162. $positions = array_keys($table);
  163. foreach ($positions as $index => $position) {
  164. $id = $ids[$index] . '_0';
  165. $next_position = isset($positions[$index + 1]) ? $positions[$index + 1] : strlen($content);
  166. $sub_content = substr($content, $position, $next_position - $position);
  167. $sub_header = Header::parse($sub_content, $document);
  168. $object = Object::factory($document, $sub_header, '');
  169. $this->objects[$id] = $object;
  170. }
  171. // It is not necessary to store this content.
  172. $content = '';
  173. return;
  174. }
  175. break;
  176. default:
  177. if ($part != 'null') {
  178. $element = $this->parseHeaderElement($part[0], $part[1], $document);
  179. if ($element) {
  180. $header = new Header(array($element), $document);
  181. }
  182. }
  183. break;
  184. }
  185. }
  186. if (!isset($this->objects[$id])) {
  187. $this->objects[$id] = Object::factory($document, $header, $content);
  188. }
  189. }
  190. /**
  191. * @param array $structure
  192. * @param Document $document
  193. *
  194. * @return Header
  195. * @throws \Exception
  196. */
  197. protected function parseHeader($structure, $document)
  198. {
  199. $elements = array();
  200. $count = count($structure);
  201. for ($position = 0; $position < $count; $position += 2) {
  202. $name = $structure[$position][1];
  203. $type = $structure[$position + 1][0];
  204. $value = $structure[$position + 1][1];
  205. $elements[$name] = $this->parseHeaderElement($type, $value, $document);
  206. }
  207. return new Header($elements, $document);
  208. }
  209. /**
  210. * @param $type
  211. * @param $value
  212. * @param $document
  213. *
  214. * @return Element|Header
  215. * @throws \Exception
  216. */
  217. protected function parseHeaderElement($type, $value, $document)
  218. {
  219. switch ($type) {
  220. case '<<':
  221. return $this->parseHeader($value, $document);
  222. case 'numeric':
  223. return new ElementNumeric($value, $document);
  224. case 'boolean':
  225. return new ElementBoolean($value, $document);
  226. case 'null':
  227. return new ElementNull($value, $document);
  228. case '(':
  229. if ($date = ElementDate::parse('(' . $value . ')', $document)) {
  230. return $date;
  231. } else {
  232. return ElementString::parse('(' . $value . ')', $document);
  233. }
  234. case '<':
  235. return $this->parseHeaderElement('(', ElementHexa::decode($value, $document), $document);
  236. case '/':
  237. return ElementName::parse('/' . $value, $document);
  238. case 'ojbref': // old mistake in tcpdf parser
  239. case 'objref':
  240. return new ElementXRef($value, $document);
  241. case '[':
  242. $values = array();
  243. foreach ($value as $sub_element) {
  244. $sub_type = $sub_element[0];
  245. $sub_value = $sub_element[1];
  246. $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
  247. }
  248. return new ElementArray($values, $document);
  249. case 'endstream':
  250. case '':
  251. // Nothing to do with.
  252. break;
  253. default:
  254. throw new \Exception('Invalid type: "' . $type . '".');
  255. }
  256. }
  257. }