PageRenderTime 68ms CodeModel.GetById 34ms app.highlight 13ms RepoModel.GetById 17ms app.codeStats 0ms

/Pdf/FileParser.php

https://bitbucket.org/gkawka/zend-framework
PHP | 485 lines | 186 code | 43 blank | 256 comment | 52 complexity | 167772665fc109bbd977818121513a3f MD5 | raw file
  1<?php
  2/**
  3 * Zend Framework
  4 *
  5 * LICENSE
  6 *
  7 * This source file is subject to the new BSD license that is bundled
  8 * with this package in the file LICENSE.txt.
  9 * It is also available through the world-wide-web at this URL:
 10 * http://framework.zend.com/license/new-bsd
 11 * If you did not receive a copy of the license and are unable to
 12 * obtain it through the world-wide-web, please send an email
 13 * to license@zend.com so we can send you a copy immediately.
 14 *
 15 * @category   Zend
 16 * @package    Zend_Pdf
 17 * @subpackage FileParser
 18 * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 19 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 20 * @version    $Id: FileParser.php 24593 2012-01-05 20:35:02Z matthew $
 21 */
 22
 23/**
 24 * Abstract utility class for parsing binary files.
 25 *
 26 * Provides a library of methods to quickly navigate and extract various data
 27 * types (signed and unsigned integers, floating- and fixed-point numbers,
 28 * strings, etc.) from the file.
 29 *
 30 * File access is managed via a {@link Zend_Pdf_FileParserDataSource} object.
 31 * This allows the same parser code to work with many different data sources:
 32 * in-memory objects, filesystem files, etc.
 33 *
 34 * @package    Zend_Pdf
 35 * @subpackage FileParser
 36 * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 37 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 38 */
 39abstract class Zend_Pdf_FileParser
 40{
 41  /**** Class Constants ****/
 42
 43    /**
 44     * Little-endian byte order (0x04 0x03 0x02 0x01).
 45     */
 46    const BYTE_ORDER_LITTLE_ENDIAN = 0;
 47
 48    /**
 49     * Big-endian byte order (0x01 0x02 0x03 0x04).
 50     */
 51    const BYTE_ORDER_BIG_ENDIAN    = 1;
 52
 53
 54
 55  /**** Instance Variables ****/
 56
 57
 58    /**
 59     * Flag indicating that the file has passed a cursory validation check.
 60     * @var boolean
 61     */
 62    protected $_isScreened = false;
 63
 64    /**
 65     * Flag indicating that the file has been sucessfully parsed.
 66     * @var boolean
 67     */
 68    protected $_isParsed = false;
 69
 70    /**
 71     * Object representing the data source to be parsed.
 72     * @var Zend_Pdf_FileParserDataSource
 73     */
 74    protected $_dataSource = null;
 75
 76
 77
 78  /**** Public Interface ****/
 79
 80
 81  /* Abstract Methods */
 82
 83    /**
 84     * Performs a cursory check to verify that the binary file is in the expected
 85     * format. Intended to quickly weed out obviously bogus files.
 86     *
 87     * Must set $this->_isScreened to true if successful.
 88     *
 89     * @throws Zend_Pdf_Exception
 90     */
 91    abstract public function screen();
 92
 93    /**
 94     * Reads and parses the complete binary file.
 95     *
 96     * Must set $this->_isParsed to true if successful.
 97     *
 98     * @throws Zend_Pdf_Exception
 99     */
100    abstract public function parse();
101
102
103  /* Object Lifecycle */
104
105    /**
106     * Object constructor.
107     *
108     * Verifies that the data source has been properly initialized.
109     *
110     * @param Zend_Pdf_FileParserDataSource $dataSource
111     * @throws Zend_Pdf_Exception
112     */
113    public function __construct(Zend_Pdf_FileParserDataSource $dataSource)
114    {
115        if ($dataSource->getSize() == 0) {
116            require_once 'Zend/Pdf/Exception.php';
117            throw new Zend_Pdf_Exception('The data source has not been properly initialized',
118                                         Zend_Pdf_Exception::BAD_DATA_SOURCE);
119        }
120        $this->_dataSource = $dataSource;
121    }
122
123    /**
124     * Object destructor.
125     *
126     * Discards the data source object.
127     */
128    public function __destruct()
129    {
130        $this->_dataSource = null;
131    }
132
133
134  /* Accessors */
135
136    /**
137     * Returns true if the file has passed a cursory validation check.
138     *
139     * @return boolean
140     */
141    public function isScreened()
142    {
143        return $this->_isScreened;
144    }
145
146    /**
147     * Returns true if the file has been successfully parsed.
148     *
149     * @return boolean
150     */
151    public function isParsed()
152    {
153        return $this->_isParsed;
154    }
155
156    /**
157     * Returns the data source object representing the file being parsed.
158     *
159     * @return Zend_Pdf_FileParserDataSource
160     */
161    public function getDataSource()
162    {
163        return $this->_dataSource;
164    }
165
166
167  /* Primitive Methods */
168
169    /**
170     * Convenience wrapper for the data source object's moveToOffset() method.
171     *
172     * @param integer $offset Destination byte offset.
173     * @throws Zend_Pdf_Exception
174     */
175    public function moveToOffset($offset)
176    {
177        $this->_dataSource->moveToOffset($offset);
178    }
179
180    public function getOffset() {
181       return $this->_dataSource->getOffset();
182    }
183
184    public function getSize() {
185       return $this->_dataSource->getSize();
186    }
187
188    /**
189     * Convenience wrapper for the data source object's readBytes() method.
190     *
191     * @param integer $byteCount Number of bytes to read.
192     * @return string
193     * @throws Zend_Pdf_Exception
194     */
195    public function readBytes($byteCount)
196    {
197        return $this->_dataSource->readBytes($byteCount);
198    }
199
200    /**
201     * Convenience wrapper for the data source object's skipBytes() method.
202     *
203     * @param integer $byteCount Number of bytes to skip.
204     * @throws Zend_Pdf_Exception
205     */
206    public function skipBytes($byteCount)
207    {
208        $this->_dataSource->skipBytes($byteCount);
209    }
210
211
212  /* Parser Methods */
213
214    /**
215     * Reads the signed integer value from the binary file at the current byte
216     * offset.
217     *
218     * Advances the offset by the number of bytes read. Throws an exception if
219     * an error occurs.
220     *
221     * @param integer $size Size of integer in bytes: 1-4
222     * @param integer $byteOrder (optional) Big- or little-endian byte order.
223     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
224     *   If omitted, uses big-endian.
225     * @return integer
226     * @throws Zend_Pdf_Exception
227     */
228    public function readInt($size, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN)
229    {
230        if (($size < 1) || ($size > 4)) {
231            require_once 'Zend/Pdf/Exception.php';
232            throw new Zend_Pdf_Exception("Invalid signed integer size: $size",
233                                         Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
234        }
235        $bytes = $this->_dataSource->readBytes($size);
236        /* unpack() will not work for this method because it always works in
237         * the host byte order for signed integers. It also does not allow for
238         * variable integer sizes.
239         */
240        if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) {
241            $number = ord($bytes[0]);
242            if (($number & 0x80) == 0x80) {
243                /* This number is negative. Extract the positive equivalent.
244                 */
245                $number = (~ $number) & 0xff;
246                for ($i = 1; $i < $size; $i++) {
247                    $number = ($number << 8) | ((~ ord($bytes[$i])) & 0xff);
248                }
249                /* Now turn this back into a negative number by taking the
250                 * two's complement (we didn't add one above so won't
251                 * subtract it below). This works reliably on both 32- and
252                 * 64-bit systems.
253                 */
254                $number = ~$number;
255            } else {
256                for ($i = 1; $i < $size; $i++) {
257                    $number = ($number << 8) | ord($bytes[$i]);
258                }
259            }
260        } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) {
261            $number = ord($bytes[$size - 1]);
262            if (($number & 0x80) == 0x80) {
263                /* Negative number. See discussion above.
264                 */
265                $number = 0;
266                for ($i = --$size; $i >= 0; $i--) {
267                    $number |= ((~ ord($bytes[$i])) & 0xff) << ($i * 8);
268                }
269                $number = ~$number;
270            } else {
271                $number = 0;
272                for ($i = --$size; $i >= 0; $i--) {
273                    $number |= ord($bytes[$i]) << ($i * 8);
274                }
275            }
276        } else {
277            require_once 'Zend/Pdf/Exception.php';
278            throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
279                                         Zend_Pdf_Exception::INVALID_BYTE_ORDER);
280        }
281        return $number;
282    }
283
284    /**
285     * Reads the unsigned integer value from the binary file at the current byte
286     * offset.
287     *
288     * Advances the offset by the number of bytes read. Throws an exception if
289     * an error occurs.
290     *
291     * NOTE: If you ask for a 4-byte unsigned integer on a 32-bit machine, the
292     * resulting value WILL BE SIGNED because PHP uses signed integers internally
293     * for everything. To guarantee portability, be sure to use bitwise operators
294     * operators on large unsigned integers!
295     *
296     * @param integer $size Size of integer in bytes: 1-4
297     * @param integer $byteOrder (optional) Big- or little-endian byte order.
298     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
299     *   If omitted, uses big-endian.
300     * @return integer
301     * @throws Zend_Pdf_Exception
302     */
303    public function readUInt($size, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN)
304    {
305        if (($size < 1) || ($size > 4)) {
306            require_once 'Zend/Pdf/Exception.php';
307            throw new Zend_Pdf_Exception("Invalid unsigned integer size: $size",
308                                         Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
309        }
310        $bytes = $this->_dataSource->readBytes($size);
311        /* unpack() is a bit heavyweight for this simple conversion. Just
312         * work the bytes directly.
313         */
314        if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) {
315            $number = ord($bytes[0]);
316            for ($i = 1; $i < $size; $i++) {
317                $number = ($number << 8) | ord($bytes[$i]);
318            }
319        } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) {
320            $number = 0;
321            for ($i = --$size; $i >= 0; $i--) {
322                $number |= ord($bytes[$i]) << ($i * 8);
323            }
324        } else {
325            require_once 'Zend/Pdf/Exception.php';
326            throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
327                                         Zend_Pdf_Exception::INVALID_BYTE_ORDER);
328        }
329        return $number;
330    }
331
332    /**
333     * Returns true if the specified bit is set in the integer bitfield.
334     *
335     * @param integer $bit Bit number to test (i.e. - 0-31)
336     * @param integer $bitField
337     * @return boolean
338     */
339    public function isBitSet($bit, $bitField)
340    {
341        $bitMask = 1 << $bit;
342        $isSet = (($bitField & $bitMask) == $bitMask);
343        return $isSet;
344    }
345
346    /**
347     * Reads the signed fixed-point number from the binary file at the current
348     * byte offset.
349     *
350     * Common fixed-point sizes are 2.14 and 16.16.
351     *
352     * Advances the offset by the number of bytes read. Throws an exception if
353     * an error occurs.
354     *
355     * @param integer $mantissaBits Number of bits in the mantissa
356     * @param integer $fractionBits Number of bits in the fraction
357     * @param integer $byteOrder (optional) Big- or little-endian byte order.
358     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
359     *   If omitted, uses big-endian.
360     * @return float
361     * @throws Zend_Pdf_Exception
362     */
363    public function readFixed($mantissaBits, $fractionBits,
364                              $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN)
365    {
366        $bitsToRead = $mantissaBits + $fractionBits;
367        if (($bitsToRead % 8) !== 0) {
368            require_once 'Zend/Pdf/Exception.php';
369            throw new Zend_Pdf_Exception('Fixed-point numbers are whole bytes',
370                                         Zend_Pdf_Exception::BAD_FIXED_POINT_SIZE);
371        }
372        $number = $this->readInt(($bitsToRead >> 3), $byteOrder) / (1 << $fractionBits);
373        return $number;
374    }
375
376    /**
377     * Reads the Unicode UTF-16-encoded string from the binary file at the
378     * current byte offset.
379     *
380     * The byte order of the UTF-16 string must be specified. You must also
381     * supply the desired resulting character set.
382     *
383     * Advances the offset by the number of bytes read. Throws an exception if
384     * an error occurs.
385     *
386     * @todo Consider changing $byteCount to a character count. They are not
387     *   always equivalent (in the case of surrogates).
388     * @todo Make $byteOrder optional if there is a byte-order mark (BOM) in the
389     *   string being extracted.
390     *
391     * @param integer $byteCount Number of bytes (characters * 2) to return.
392     * @param integer $byteOrder (optional) Big- or little-endian byte order.
393     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
394     *   If omitted, uses big-endian.
395     * @param string $characterSet (optional) Desired resulting character set.
396     *   You may use any character set supported by {@link iconv()}. If omitted,
397     *   uses 'current locale'.
398     * @return string
399     * @throws Zend_Pdf_Exception
400     */
401    public function readStringUTF16($byteCount,
402                                    $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN,
403                                    $characterSet = '')
404    {
405        if ($byteCount == 0) {
406            return '';
407        }
408        $bytes = $this->_dataSource->readBytes($byteCount);
409        if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) {
410            if ($characterSet == 'UTF-16BE') {
411                return $bytes;
412            }
413            return iconv('UTF-16BE', $characterSet, $bytes);
414        } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) {
415            if ($characterSet == 'UTF-16LE') {
416                return $bytes;
417            }
418            return iconv('UTF-16LE', $characterSet, $bytes);
419        } else {
420            require_once 'Zend/Pdf/Exception.php';
421            throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
422                                         Zend_Pdf_Exception::INVALID_BYTE_ORDER);
423        }
424    }
425
426    /**
427     * Reads the Mac Roman-encoded string from the binary file at the current
428     * byte offset.
429     *
430     * You must supply the desired resulting character set.
431     *
432     * Advances the offset by the number of bytes read. Throws an exception if
433     * an error occurs.
434     *
435     * @param integer $byteCount Number of bytes (characters) to return.
436     * @param string $characterSet (optional) Desired resulting character set.
437     *   You may use any character set supported by {@link iconv()}. If omitted,
438     *   uses 'current locale'.
439     * @return string
440     * @throws Zend_Pdf_Exception
441     */
442    public function readStringMacRoman($byteCount, $characterSet = '')
443    {
444        if ($byteCount == 0) {
445            return '';
446        }
447        $bytes = $this->_dataSource->readBytes($byteCount);
448        if ($characterSet == 'MacRoman') {
449            return $bytes;
450        }
451        return iconv('MacRoman', $characterSet, $bytes);
452    }
453
454    /**
455     * Reads the Pascal string from the binary file at the current byte offset.
456     *
457     * The length of the Pascal string is determined by reading the length bytes
458     * which preceed the character data. You must supply the desired resulting
459     * character set.
460     *
461     * Advances the offset by the number of bytes read. Throws an exception if
462     * an error occurs.
463     *
464     * @param string $characterSet (optional) Desired resulting character set.
465     *   You may use any character set supported by {@link iconv()}. If omitted,
466     *   uses 'current locale'.
467     * @param integer $lengthBytes (optional) Number of bytes that make up the
468     *   length. Default is 1.
469     * @return string
470     * @throws Zend_Pdf_Exception
471     */
472    public function readStringPascal($characterSet = '', $lengthBytes = 1)
473    {
474        $byteCount = $this->readUInt($lengthBytes);
475        if ($byteCount == 0) {
476            return '';
477        }
478        $bytes = $this->_dataSource->readBytes($byteCount);
479        if ($characterSet == 'ASCII') {
480            return $bytes;
481        }
482        return iconv('ASCII', $characterSet, $bytes);
483    }
484
485}