/classes/class.pdf2txt.inc.php
https://bitbucket.org/xongie/rexsearch · PHP · 297 lines · 231 code · 33 blank · 33 comment · 21 complexity · 40315fe55e6402edb415b4ce49c22d69 MD5 · raw file
- <?php
- class pdf2txt
- {
- var $src;
- var $dest;
- var $data;
-
- // constructor
- function pdf2txt($_src = false, $_dest = false)
- {
- $this->setSource($_src);
- $this->setDestination($_dest);
- }
-
- // set data if no conversion from file nescessary
- function setInput($_data)
- {
- $this->data = $_data;
- }
-
- // sets the source-file
- function setSource($_src)
- {
- $this->src = $_src;
- }
-
- // sets the destination-file
- function setDestination($_dest)
- {
- $this->dest = $_dest;
- }
-
- static function directConvert($_data)
- {
- $pdf2txt = new self();
- return $pdf2txt->convert($_data);
- }
-
- // convert to pdf
- function convert($_data = false)
- {
- if(false !== $_data)
- $this->data = $_data;
-
-
- if(
- // load from file?
- (false !== $this->src) AND
- // file exists?
- (false === $this->data = file_get_contents($this->src))
- )
- {
- // [ ERROR ]
- // file does not exist
- return false;
- }
-
- if($this->data === false)
- {
- // [ ERROR ]
- // nothing to convert
- return false;
- }
-
-
- // ###############################
- // data available -> start parsing
- // ###############################
-
- // parse encoding
- preg_match('~/Encoding\s*/(\w+)~ism', $this->data, $encoding);
-
- // detect encoding and assume that there is only a single charset for the hole document
- $fromEncoding = 'windows-1252';
- switch($encoding[1])
- {
- case 'MacRomanEncoding':
- $fromEncoding = 'macintosh';
- break;
-
- case 'WinAnsiEncoding':
- // standard encoding
- break;
- }
-
- // parse data
- // the following code ignores the keyword "stream" and "endstream" if they are in a string
- $isStream = false;
- $stream = '';
- $streams = array();
- $openBracketCount = 0;
- $encodedStream = false;
- foreach(preg_split('~(<<\s*/.*?>>\s*stream\s*)|(\s*endstream\s*)|(\()|(\))~ism', $this->data, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $k => $part)
- {
- if(preg_match('~<<\s*/(.*?)>>\s*stream\s*~ism', $part, $match))
- {
- $switch = 'stream';
- if(false !== strpos($match[1], '/Filter'))
- $encodedStream = true;
- }
- else
- $switch = trim($part);
- switch($switch)
- {
- case '(':
- if($isStream AND !$encodedStream)
- $openBracketCount++;
- break;
-
- case ')':
- if($isStream AND !$encodedStream)
- $openBracketCount--;
- break;
-
- case 'endstream':
- if($isStream AND $openBracketCount <= 0)
- {
- $isStream = false;
- $streams[] = $stream;
- $stream = '';
- $encodedStream = false;
- }
- break;
- }
-
- if($isStream)
- {
- $stream .= $part;
- }
-
- if($switch == 'stream')
- {
- if($isStream)
- $stream .= $part;
- else
- $isStream = true;
- }
- }
-
- $textObjects = array();
- foreach($streams as $k => $stream)
- {
- // uncompress the stream
- if(false === $uncompressed = @gzuncompress($stream))
- // if nothing to uncompress, assume that the stream is already uncompressed
- $uncompressed = $stream;
-
- // convert to internal encoding UTF-8
- $uncompressed = @iconv($fromEncoding, 'UTF-8', $uncompressed);
-
- // replace escaped brackets with placeholders
- $text = str_replace(array('\(','\)','\[','\]'), array('##STARTBRACKET##','##ENDBRACKET##','##STARTSBRACKET##','##ENDSBRACKET##'), $uncompressed);
-
- // parse streams
- // the following code ignores the keyword "BT" and "ET" if they are in a string
- $isTextObj = false;
- $textObject = '';
- $openBracketCount = 0;
- foreach(preg_split('~(\s*BT\s+)|(\s+ET\s+)|(\()|(\))~ism', $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $k => $part)
- {
- $switch = trim($part);
- switch($switch)
- {
- case '(':
- if($isTextObj)
- $openBracketCount++;
- break;
-
- case ')':
- if($isTextObj)
- $openBracketCount--;
- break;
-
- case 'ET':
- if($isTextObj AND $openBracketCount <= 0)
- {
- $isTextObj = false;
- $textObjects[] = $textObject;
- $textObject = '';
- }
- break;
- }
-
- if($isTextObj)
- {
- $textObject .= $part;
- }
-
- if($switch == 'BT')
- {
- if($isTextObj)
- $textObject .= $part;
- else
- $isTextObj = true;
- }
- }
- }
-
- $return = '';
- $string = '';
- foreach($textObjects as $textObject)
- {
- // parse text-objects
- // the following code ignores PDF-keywords if they are in a string
- $isString = false;
- $openBracketCount = 0;
-
- foreach(preg_split('~(?:\s+(Td|TD|T\*|"|\')\s+)|(\()|(\))~ism', $textObject, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $k => $part)
- {
- switch($part)
- {
- // new line
- case 'Td':
- case 'TD':
- case 'T*':
- case '"':
- case "'":
- if(!$isString)
- $return .= "\n";
- break;
-
- case ')':
- if($isString AND $openBracketCount <= 0)
- {
- $isString = false;
- $return .= $string;
- $string = '';
- }
- elseif($isString)
- $openBracketCount--;
- break;
- }
-
- if($isString)
- {
- $string .= $part;
- }
-
- if($part == '(')
- {
- if($isString)
- {
- $openBracketCount++;
- }
- else
- {
- $isString = true;
- }
- }
- }
-
- $return .= "\n";
- }
-
- // substitute the placeholders for the brackets and escape sequences
- $convert = array(
- '##STARTBRACKET##' => '(',
- '##ENDBRACKET##' => ')',
- '##STARTSBRACKET##' => '[',
- '##ENDSBRACKET##' => ']',
- "\\\n" => "\n",
- "\\\r" => "\n",
- "\\\n\r" => "\n",
- "\\\t" => "\t",
- "\\\b" => "\b",
- "\\\f" => "\f",
- '\\\\' => '\\'
- );
-
- // replace octal character codes
- $text = preg_replace_callback(
- '~\\\\([0-8]{3})~',
- create_function(
- '$matches',
- ' if(octdec($matches[1]) > 32)
- return utf8_encode(chr(octdec($matches[1])));
- else
- return "";
- '
- ),
- $return
- );
-
- // execute conversion with $convert
- $text = strtr(($text), $convert);
-
- if(false !== $this->dest)
- // store $text into the specified destination file
- // and return true on success or false on error
- return false !== file_put_contents($this->dest);
- else
- // return $text
- return $text;
- }
- }
- ?>