/modules/AOD_Index/PdfParser.php
PHP | 337 lines | 210 code | 44 blank | 83 comment | 32 complexity | be5bfea712504059bba6b8b5d09df9f0 MD5 | raw file
- <?php
- /**
- * @file
- * Class PdfParser
- *
- * @author : Sebastien MALOT <sebastien@malot.fr>
- * @date : 2013-08-08
- *
- * References :
- * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
- * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
- * - http://www.php.net/manual/en/ref.pdf.php#74211
- */
class PdfParser
{
    /**
     * Read a PDF file and return the text found in its content streams.
     *
     * @param string $filename Path handed to file_get_contents().
     * @return string|null Extracted text, or null when the file cannot be
     *                     read or no text could be extracted.
     */
    public static function parseFile($filename)
    {
        $content = file_get_contents($filename);
        if ($content === false) {
            // Unreadable file: same result as "no text found".
            return null;
        }

        return self::extractText($content);
    }

    /**
     * Extract the text from PDF data that is already in memory.
     *
     * @param string $content Raw bytes of the PDF document.
     * @return string|null Extracted text, or null when no text could be extracted.
     */
    public static function parseContent($content)
    {
        return self::extractText($content);
    }

    /**
     * Convert raw PDF data into text.
     *
     * The document is split into "obj ... endobj" sections. For every section
     * that has both a dictionary ("<< ... >>") and a stream
     * ("stream ... endstream"), the stream is decoded and scanned for text
     * operators.
     *
     * @param string $data Raw bytes of the PDF document.
     * @return string|null The extracted text with whitespace runs collapsed
     *                     to single spaces, or null when nothing was found.
     */
    protected static function extractText($data)
    {
        $result_data = '';

        foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
            // The first dictionary of the object tells how the stream is encoded.
            $a_filter = self::getDataArray($obj, '<<', '>>');
            if (!isset($a_filter[0])) {
                continue;
            }

            $a_stream = self::getDataArray($obj, 'stream', 'endstream');
            if (!isset($a_stream[0])) {
                continue;
            }

            // Drop the surrounding "stream" / "endstream" keywords.
            $raw = (string) substr(
                $a_stream[0],
                strlen('stream'),
                strlen($a_stream[0]) - strlen('stream') - strlen('endstream')
            );

            $decoded = self::decodeStream($a_filter[0], $raw);
            if (trim($decoded) != '') {
                $result_data .= ' ' . self::extractTextElements($decoded);
            }
        }

        if (trim($result_data) == '') {
            return null;
        }

        // Re-join words that were hyphenated across a line break.
        $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);
        // Collapse every run of whitespace into a single space.
        $result_data = preg_replace('/\s+/', ' ', $result_data);

        return $result_data;
    }

    /**
     * Decode the payload of one stream according to its dictionary.
     *
     * Only FlateDecode is handled; any other stream is returned as is
     * (trimmed), exactly as if it were uncompressed.
     *
     * @param string $filter The object's dictionary text ("<< ... >>").
     * @param string $raw    Bytes found between "stream" and "endstream".
     * @return string Decoded data, or '' when inflating failed.
     */
    private static function decodeStream($filter, $raw)
    {
        if (strpos($filter, 'FlateDecode') === false) {
            return trim($raw);
        }

        // trim() is tried first because that is what this parser always did,
        // but it also strips payload bytes that merely look like whitespace
        // (e.g. a checksum ending in 0x0A or 0x20). When that breaks the
        // stream, retry with the trailing bytes left alone, then with only a
        // single end-of-line marker removed.
        $body = ltrim($raw);
        $candidates = array(trim($raw), $body);
        $last = substr($body, -1);
        if ($last === "\n" || $last === "\r") {
            $candidates[] = substr($body, 0, -1);
        }
        if (substr($body, -2) === "\r\n") {
            $candidates[] = substr($body, 0, -2);
        }

        foreach ($candidates as $candidate) {
            if (!is_string($candidate) || $candidate === '') {
                continue;
            }
            // Errors are suppressed on purpose: many streams are not text
            // and failing to inflate them is expected.
            $inflated = @gzuncompress($candidate);
            if ($inflated !== false) {
                return $inflated;
            }
        }

        return '';
    }

    /**
     * Extract the visible text from one decoded content stream.
     *
     * The stream is processed line by line. A line is split into its operands
     * ("command") and a trailing operator; only the text operators
     * Td, TD, Tf, TJ, Tj and T* contribute to the output.
     *
     * @param string $content Decoded stream data.
     * @return string Text found in the stream ('' for streams starting with /CIDInit).
     */
    protected static function extractTextElements($content)
    {
        if (strpos($content, '/CIDInit') === 0) {
            return '';
        }

        $text = '';

        foreach (explode("\n", $content) as $line) {
            $line = trim($line);
            $matches = array();

            // Split "<operands> <operator>"; the operands must end with ')', ']' or a space.
            if (preg_match('/^(?<command>.*[\)\] ])(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
                $command = self::decodeOperands(trim($matches['command']));
                $operator = trim($matches['operator']);
            } else {
                $command = $line;
                $operator = '';
            }

            switch ($operator) {
                // Move text current point.
                case 'Td':
                    $values = explode(' ', $command);
                    // Cast explicitly: comparing a non-numeric string with an
                    // int gives different results on PHP 7 and PHP 8.
                    $y = (float) array_pop($values);
                    $x = (float) array_pop($values);
                    if ($x > 0) {
                        $text .= ' ';
                    }
                    if ($y < 0) {
                        $text .= ' ';
                    }
                    break;

                // Move text current point and set leading.
                case 'TD':
                    $values = explode(' ', $command);
                    $y = (float) array_pop($values);
                    if ($y < 0) {
                        $text .= "\n";
                    }
                    break;

                // Set font name and size.
                case 'Tf':
                    $text .= ' ';
                    break;

                // Display text, allowing individual character positioning.
                case 'TJ':
                    $open = mb_strpos($command, '[', 0, 'UTF-8');
                    $close = mb_strrpos($command, ']', 0, 'UTF-8');
                    // A missing '[' (array started on a previous line) means
                    // "from the beginning"; a missing ']' leaves nothing to parse.
                    $from = ($open === false) ? 0 : $open + 1;
                    if ($close !== false && $close >= $from) {
                        $text .= self::parseTextCommand(mb_substr($command, $from, $close - $from, 'UTF-8'));
                    }
                    break;

                // Display text.
                case 'Tj':
                    $open = mb_strpos($command, '(', 0, 'UTF-8');
                    $close = mb_strrpos($command, ')', 0, 'UTF-8');
                    $from = ($open === false) ? 0 : $open + 1;
                    if ($close !== false && $close >= $from) {
                        // Keep only what is between the outer round brackets.
                        $text .= mb_substr($command, $from, $close - $from, 'UTF-8');
                    }
                    break;

                // Move to start of next line.
                case 'T*':
                    $text .= "\n";
                    break;

                // Tc, TL, Tm, Tr, Ts, Tw, Tz, g, gs, re, f, BT, ET and any
                // other operator produce no text.
                default:
                    break;
            }
        }

        // Escaped round brackets were kept escaped until now so that they
        // could not be mistaken for string delimiters.
        return str_replace(array('\\(', '\\)'), array('(', ')'), $text);
    }

    /**
     * Resolve escape sequences in the operands of an operator and make sure
     * the result is UTF-8.
     *
     * @param string $command Operand part of a content stream line.
     * @return string
     */
    private static function decodeOperands($command)
    {
        // Three-digit octal escapes, replaced in a single pass so that a
        // decoded backslash can never combine with following digits.
        $command = preg_replace_callback(
            '/\\\\([0-7]{3})/',
            function ($match) {
                $code = octdec($match[1]);
                if ($code < 32) {
                    // Skip non printable chars (everything below octal 040).
                    return '';
                }
                $char = chr($code & 0xFF);
                if ($char === '(' || $char === ')') {
                    // Keep decoded brackets escaped; a bare one would end the
                    // enclosing string too early when the operands are parsed.
                    return '\\' . $char;
                }
                return $char;
            },
            $command
        );

        // Removes encoded new lines, tabs, ...
        $command = preg_replace('/\\\\[\r\n]/', '', $command);
        $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

        // Force UTF-8 charset.
        $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));
        if (strtoupper((string) $encoding) != 'UTF-8') {
            // Best effort: keep the original bytes when the conversion fails.
            $converted = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command);
            if ($converted !== false && $converted !== '') {
                $command = $converted;
            }
        }

        return $command;
    }

    /**
     * Concatenate the "(...)" strings of a TJ array.
     *
     * The text between two strings is read as a positioning adjustment: a
     * space is inserted when it is longer than 8 characters or when its
     * numeric value is below -50.
     *
     * @param string $text      Content of the TJ array, without the square brackets.
     * @param int    $font_size Currently not used.
     *
     * @return string
     */
    protected static function parseTextCommand($text, $font_size = 0)
    {
        $result = '';
        $cur_start_pos = 0;

        while (($cur_start_text = mb_strpos($text, '(', $cur_start_pos, 'UTF-8')) !== false) {
            // Whatever sits between the previous ')' and this '(' is the adjustment.
            $gap_length = $cur_start_text - $cur_start_pos;
            if ($gap_length > 8) {
                $spacing = ' ';
            } else {
                // Explicit cast: on PHP 8 an empty string compared with an int
                // is compared as a string, and '' < '-50' is true.
                $adjustment = (float) trim(mb_substr($text, $cur_start_pos, $gap_length, 'UTF-8'));
                $spacing = ($adjustment < -50) ? ' ' : '';
            }

            $cur_start_text++;

            // Find the closing bracket, skipping the ones preceded by a backslash.
            $start_search_end = $cur_start_text;
            while (($cur_start_pos = mb_strpos($text, ')', $start_search_end, 'UTF-8')) !== false) {
                if (mb_substr($text, $cur_start_pos - 1, 1, 'UTF-8') != '\\') {
                    break;
                }
                $start_search_end = $cur_start_pos + 1;
            }

            // Unterminated string: stop with what has been collected so far.
            if ($cur_start_pos === false) {
                break;
            }

            $result .= $spacing . mb_substr($text, $cur_start_text, $cur_start_pos - $cur_start_text, 'UTF-8');
            $cur_start_pos++;
        }

        return $result;
    }

    /**
     * Collect every section of the data delimited by a start and an end word.
     *
     * Sections do not overlap: the search for the next section resumes right
     * after the end word of the previous one, so a start word that is part of
     * the end word (e.g. "obj" in "endobj") is not matched again.
     *
     * @param string $data       The data.
     * @param string $start_word The start of each section of data.
     * @param string $end_word   The end of each section of data.
     * @return array The sections, each including its start and end word.
     */
    protected static function getDataArray($data, $start_word, $end_word)
    {
        $a_results = array();
        $data = (string) $data;
        $start_length = strlen($start_word);
        $end_length = strlen($end_word);

        // An empty delimiter can never make progress through the data.
        if ($start_length === 0 || $end_length === 0) {
            return $a_results;
        }

        $offset = 0;
        while (($start = strpos($data, $start_word, $offset)) !== false) {
            $end = strpos($data, $end_word, $start + $start_length);
            if ($end === false) {
                break;
            }

            // The section runs from the start word to the end of the end word.
            $a_results[] = substr($data, $start, $end - $start + $end_length);
            $offset = $end + $end_length;
        }

        return $a_results;
    }
}