/vendor/smalot/pdfparser/src/Smalot/PdfParser/Parser.php
PHP | 313 lines | 179 code | 56 blank | 78 comment | 14 complexity | 377efc8fb54f5486df691c8bd9c3f0b0 MD5 | raw file
- <?php
- /**
- * @file
- * This file is part of the PdfParser library.
- *
- * @author Sébastien MALOT <sebastien@malot.fr>
- * @date 2013-08-08
- * @license GPL-3.0
- * @url <https://github.com/smalot/pdfparser>
- *
- * PdfParser is a pdf library written in PHP, extraction oriented.
- * Copyright (C) 2014 - Sébastien MALOT <sebastien@malot.fr>
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.
- * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
- *
- */
- namespace Smalot\PdfParser;
- use Smalot\PdfParser\Element\ElementArray;
- use Smalot\PdfParser\Element\ElementBoolean;
- use Smalot\PdfParser\Element\ElementDate;
- use Smalot\PdfParser\Element\ElementHexa;
- use Smalot\PdfParser\Element\ElementName;
- use Smalot\PdfParser\Element\ElementNull;
- use Smalot\PdfParser\Element\ElementNumeric;
- use Smalot\PdfParser\Element\ElementString;
- use Smalot\PdfParser\Element\ElementXRef;
/**
 * Class Parser
 *
 * Turns raw PDF data into a Document. The low-level tokenizing is delegated
 * to TCPDF's parser; this class maps the structures it returns onto the
 * library's Header, Element and Object classes.
 *
 * @package Smalot\PdfParser
 */
class Parser
{
    /**
     * Objects collected during the current parseContent() run, keyed by
     * object id (members of object streams are stored as "<number>_0").
     *
     * NOTE(review): "Object" is a reserved class name since PHP 7.2; renaming
     * it has to happen where the class is declared, outside this file.
     *
     * @var Object[]
     */
    protected $objects = array();

    /**
     * Nothing to initialise; kept so subclasses can safely call parent::__construct().
     */
    public function __construct()
    {
    }

    /**
     * Parse PDF file
     *
     * Reads the whole file into memory and hands it to parseContent().
     *
     * @param string $filename Path (or stream wrapper URL) readable by file_get_contents().
     *
     * @return Document
     * @throws \Exception When the file cannot be read or its content cannot be parsed.
     */
    public function parseFile($filename)
    {
        $content = file_get_contents($filename);

        // file_get_contents() signals failure with false; fail loudly here
        // instead of feeding an empty string to the parser.
        if ($content === false) {
            throw new \Exception('Unable to read file "' . $filename . '".');
        }

        // No error-control operator here: it used to hide fatal errors and
        // made this method die silently.
        return $this->parseContent($content);
    }

    /**
     * Parse PDF content
     *
     * @param string $content Raw PDF data.
     *
     * @return Document
     * @throws \Exception When the file is encrypted, when no object is found,
     *                    or when the underlying TCPDF parser throws.
     */
    public function parseContent($content)
    {
        // Create structure using TCPDF Parser. Anything it prints is
        // captured and discarded through the output buffer.
        ob_start();

        try {
            @$parser = new \TCPDF_PARSER(ltrim($content));
            list($xref, $data) = $parser->getParsedData();
            unset($parser);
        } catch (\Exception $e) {
            // Close the buffer opened above before propagating, otherwise
            // the caller is left with a dangling output buffer level.
            ob_end_clean();

            throw $e;
        }

        ob_end_clean();

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object.
        $document      = new Document();
        $this->objects = array();

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }

    /**
     * Convert the trailer array into a Header.
     *
     * Keys are passed through ucfirst(). Numeric values become
     * ElementNumeric, nested arrays are converted recursively and wrapped in
     * an ElementArray, strings containing "_" become ElementXRef and every
     * other value is parsed as a literal string element.
     *
     * @param array         $structure Trailer entries, keyed by name.
     * @param Document|null $document  Null for nested levels (see the recursive call).
     *
     * @return Header
     * @throws \Exception
     */
    protected function parseTrailer($structure, $document)
    {
        $trailer = array();

        foreach ($structure as $name => $values) {
            $name = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$name] = new ElementNumeric($values, $document);
            } elseif (is_array($values)) {
                $value          = $this->parseTrailer($values, null);
                $trailer[$name] = new ElementArray($value, null);
            } elseif (strpos($values, '_') !== false) {
                $trailer[$name] = new ElementXRef($values, $document);
            } else {
                $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }

    /**
     * Convert one raw object into an Object and register it in $this->objects.
     *
     * Each part of $structure is read as array(type, value, ...). A stream
     * whose header has Type "ObjStm" is not stored itself: the objects it
     * contains are registered instead.
     *
     * @param string   $id
     * @param array    $structure
     * @param Document $document
     *
     * @throws \Exception
     */
    protected function parseObject($id, $structure, $document)
    {
        $header  = new Header(array(), $document);
        $content = '';

        foreach ($structure as $part) {
            switch ($part[0]) {
                case '[':
                    $elements = array();

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Prefer $part[3][0] when present (presumably the decoded
                    // stream - confirm against TCPDF), else the raw value.
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        // It is not necessary to store this content: only the
                        // embedded objects are registered.
                        $this->parseObjectStream($content, $document);

                        return;
                    }
                    break;

                default:
                    // Skips a part that is the bare string 'null'.
                    if ($part != 'null') {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header(array($element), $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = Object::factory($document, $header, $content);
        }
    }

    /**
     * Register every object stored inside an object stream.
     *
     * The stream starts with whitespace-separated integer pairs; the first
     * integer of a pair is used as object number and the second as the
     * offset of that object inside the remaining content.
     *
     * @param string   $content  Content of the object stream.
     * @param Document $document
     */
    protected function parseObjectStream($content, $document)
    {
        $match = array();

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $content = $match[3];

        // Extract xrefs.
        $xrefs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $table = array();

        foreach ($xrefs as $xref) {
            // The pair may be separated by any whitespace run (the patterns
            // above accept \s+), so splitting on a single space is not enough.
            list($number, $offset) = preg_split('/\s+/', trim($xref));
            $table[$offset]        = $number;
        }

        // Order by offset so each object ends where the next one starts.
        ksort($table);

        $numbers = array_values($table);
        $offsets = array_keys($table);

        foreach ($offsets as $index => $offset) {
            $object_id   = $numbers[$index] . '_0';
            $next_offset = isset($offsets[$index + 1]) ? $offsets[$index + 1] : strlen($content);
            $sub_content = substr($content, $offset, $next_offset - $offset);
            $sub_header  = Header::parse($sub_content, $document);

            $this->objects[$object_id] = Object::factory($document, $sub_header, '');
        }
    }

    /**
     * Convert a flat list of alternating name / value tokens into a Header.
     *
     * @param array    $structure Tokens as array(type, value); even positions
     *                            hold names, odd positions the matching values.
     * @param Document $document
     *
     * @return Header
     * @throws \Exception
     */
    protected function parseHeader($structure, $document)
    {
        $elements = array();
        $count    = count($structure);

        // Stop before a trailing name that has no value (odd-length input)
        // rather than reading an undefined offset.
        for ($position = 0; $position + 1 < $count; $position += 2) {
            $name  = $structure[$position][1];
            $type  = $structure[$position + 1][0];
            $value = $structure[$position + 1][1];

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }

    /**
     * Convert a single typed token into the matching element.
     *
     * @param string   $type     Token type ('<<', 'numeric', 'boolean', 'null', '(',
     *                           '<', '/', 'objref', '[', 'endstream' or '').
     * @param mixed    $value    Token value; a list of sub-tokens for '<<' and '['.
     * @param Document $document
     *
     * @return Element|Header|null Null for 'endstream' and empty types.
     * @throws \Exception When the type is not one of the above.
     */
    protected function parseHeaderElement($type, $value, $document)
    {
        switch ($type) {
            case '<<':
                return $this->parseHeader($value, $document);

            case 'numeric':
                return new ElementNumeric($value, $document);

            case 'boolean':
                return new ElementBoolean($value, $document);

            case 'null':
                return new ElementNull($value, $document);

            case '(':
                // A literal string is tried as a date first.
                if ($date = ElementDate::parse('(' . $value . ')', $document)) {
                    return $date;
                }

                return ElementString::parse('(' . $value . ')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value, $document), $document);

            case '/':
                return ElementName::parse('/' . $value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = array();

                foreach ($value as $sub_element) {
                    $values[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case '':
                // Nothing to do with.
                break;

            default:
                throw new \Exception('Invalid type: "' . $type . '".');
        }

        return null;
    }
}