/classes/class.pdf2txt.inc.php

https://bitbucket.org/xongie/rexsearch · PHP · 297 lines · 231 code · 33 blank · 33 comment · 21 complexity · 40315fe55e6402edb415b4ce49c22d69 MD5 · raw file

  1. <?php
  /**
   * Minimal PDF-to-text extractor.
   *
   * Works purely on the raw bytes of a PDF document: it locates the
   * "stream ... endstream" sections, inflates them when they are zlib
   * compressed, cuts out the "BT ... ET" text objects and finally collects
   * the literal strings "( ... )" found inside those text objects.
   *
   * Input is either a file (see setSource()) or a raw string (see setInput()
   * or the argument of convert()). The extracted text is either returned or
   * written to a destination file (see setDestination()).
   */
  class pdf2txt
  {
      /** @var string|false Path of the PDF file to read, false to use in-memory data. */
      public $src = false;

      /** @var string|false Path of the file to write the text to, false to return the text. */
      public $dest = false;

      /** @var string|false Raw PDF data, false as long as nothing was supplied. */
      public $data = false;

      /**
       * @param string|false $_src  source file, false if the data is supplied directly
       * @param string|false $_dest destination file, false to get the text returned
       */
      public function __construct($_src = false, $_dest = false)
      {
          $this->setSource($_src);
          $this->setDestination($_dest);
      }

      /**
       * Legacy PHP 4 style constructor name.
       *
       * Same-named methods are no longer treated as constructors by PHP 8, so
       * the real work lives in __construct(). This alias is kept only so that
       * code calling pdf2txt() explicitly keeps working.
       *
       * @param string|false $_src
       * @param string|false $_dest
       */
      public function pdf2txt($_src = false, $_dest = false)
      {
          $this->__construct($_src, $_dest);
      }

      /**
       * Sets the raw PDF data if no conversion from a file is necessary.
       *
       * @param string $_data
       */
      public function setInput($_data)
      {
          $this->data = $_data;
      }

      /**
       * Sets the source file.
       *
       * @param string|false $_src
       */
      public function setSource($_src)
      {
          $this->src = $_src;
      }

      /**
       * Sets the destination file.
       *
       * @param string|false $_dest
       */
      public function setDestination($_dest)
      {
          $this->dest = $_dest;
      }

      /**
       * Converts raw PDF data to text without configuring an instance first.
       *
       * @param string $_data raw PDF data
       * @return string|false extracted text, false if there was nothing to convert
       */
      public static function directConvert($_data)
      {
          $pdf2txt = new self();
          return $pdf2txt->convert($_data);
      }

      /**
       * Converts the PDF data to plain text.
       *
       * A configured source file takes precedence over in-memory data: when
       * $this->src is set, the file content replaces whatever was passed in.
       *
       * @param string|false $_data raw PDF data, false to use the source file
       *                            or the data set via setInput()
       * @return string|bool the extracted text when no destination file is set;
       *                     otherwise true/false for the write result. false is
       *                     also returned when the source cannot be read or no
       *                     data is available.
       */
      public function convert($_data = false)
      {
          if (false !== $_data) {
              $this->data = $_data;
          }

          if (false !== $this->src) {
              $this->data = file_get_contents($this->src);
              if (false === $this->data) {
                  // [ ERROR ] source file could not be read
                  return false;
              }
          }

          if (false === $this->data || null === $this->data) {
              // [ ERROR ] nothing to convert
              return false;
          }

          // a single charset is assumed for the whole document
          $fromEncoding = $this->detectEncoding($this->data);
          $streams = $this->extractStreams($this->data);
          $textObjects = $this->extractTextObjects($streams, $fromEncoding);
          $text = $this->decodeEscapes($this->extractText($textObjects), $fromEncoding);

          if (false !== $this->dest) {
              // store the text in the destination file, report success as bool
              return false !== file_put_contents($this->dest, $text);
          }

          return $text;
      }

      /**
       * Picks the iconv charset name from the first /Encoding entry of the document.
       *
       * @param string $data raw PDF data
       * @return string 'macintosh' for MacRomanEncoding, 'windows-1252' otherwise
       *                (also when the document declares no encoding at all)
       */
      private function detectEncoding($data)
      {
          if (preg_match('~/Encoding\s*/(\w+)~ism', $data, $match)
              && 'MacRomanEncoding' === $match[1]
          ) {
              return 'macintosh';
          }

          // WinAnsiEncoding and everything unknown
          return 'windows-1252';
      }

      /**
       * Collects the content of all "stream ... endstream" sections.
       *
       * For streams without a /Filter entry the round brackets are counted so
       * that an "endstream" keyword inside a literal string does not terminate
       * the stream.
       *
       * @param string $data raw PDF data
       * @return array list of raw (possibly compressed) stream contents
       */
      private function extractStreams($data)
      {
          $parts = preg_split(
              '~(<<\s*/.*?>>\s*stream\s*)|(\s*endstream\s*)|(\()|(\))~ism',
              $data,
              -1,
              PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
          );
          if (false === $parts) {
              // PCRE gave up (e.g. backtrack limit): nothing can be extracted
              return array();
          }

          $streams = array();
          $stream = '';
          $isStream = false;
          $encodedStream = false;
          $openBracketCount = 0;

          foreach ($parts as $part) {
              if (preg_match('~<<\s*/(.*?)>>\s*stream\s*~ism', $part, $match)) {
                  $token = 'stream';
                  if (false !== strpos($match[1], '/Filter')) {
                      $encodedStream = true;
                  }
              } else {
                  $token = trim($part);
              }

              switch ($token) {
                  case '(':
                      if ($isStream && !$encodedStream) {
                          $openBracketCount++;
                      }
                      break;
                  case ')':
                      if ($isStream && !$encodedStream) {
                          $openBracketCount--;
                      }
                      break;
                  case 'endstream':
                      if ($isStream && $openBracketCount <= 0) {
                          $isStream = false;
                          $streams[] = $stream;
                          $stream = '';
                          $encodedStream = false;
                      }
                      break;
              }

              if ($isStream) {
                  // every part inside a stream is content and is kept exactly once
                  $stream .= $part;
              } elseif ('stream' === $token) {
                  // the header itself is not part of the content
                  $isStream = true;
              }
          }

          return $streams;
      }

      /**
       * Extracts the "BT ... ET" text objects from the given streams.
       *
       * Each stream is inflated if possible, converted to UTF-8 and has its
       * escaped brackets replaced by placeholders so that they do not disturb
       * the bracket counting. The placeholders are resolved in decodeEscapes().
       *
       * @param array  $streams      raw stream contents
       * @param string $fromEncoding iconv name of the document charset
       * @return array list of text object bodies (without the BT/ET keywords)
       */
      private function extractTextObjects($streams, $fromEncoding)
      {
          $textObjects = array();

          foreach ($streams as $stream) {
              // "@": gzuncompress() warns on data that is not zlib compressed;
              // in that case the stream is assumed to be plain already
              $uncompressed = @gzuncompress($stream);
              if (false === $uncompressed) {
                  $uncompressed = $stream;
              }

              // "@": iconv() raises a notice on bytes that are invalid in the
              // source charset (typical for binary streams); such streams
              // cannot contain usable text and are skipped
              $uncompressed = @iconv($fromEncoding, 'UTF-8', $uncompressed);
              if (false === $uncompressed) {
                  continue;
              }

              $text = str_replace(
                  array('\(', '\)', '\[', '\]'),
                  array('##STARTBRACKET##', '##ENDBRACKET##', '##STARTSBRACKET##', '##ENDSBRACKET##'),
                  $uncompressed
              );

              // "ET" may be the very last token: the whitespace in front of
              // "endstream" was already consumed by the stream splitter, hence \z
              $parts = preg_split(
                  '~(\s*BT\s+)|(\s+ET(?:\s+|\z))|(\()|(\))~ism',
                  $text,
                  -1,
                  PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
              );
              if (false === $parts) {
                  continue;
              }

              $isTextObj = false;
              $textObject = '';
              $openBracketCount = 0;

              foreach ($parts as $part) {
                  $token = trim($part);

                  switch ($token) {
                      case '(':
                          if ($isTextObj) {
                              $openBracketCount++;
                          }
                          break;
                      case ')':
                          if ($isTextObj) {
                              $openBracketCount--;
                          }
                          break;
                      case 'ET':
                          // "ET" inside a string (open bracket) is ordinary text
                          if ($isTextObj && $openBracketCount <= 0) {
                              $isTextObj = false;
                              $textObjects[] = $textObject;
                              $textObject = '';
                          }
                          break;
                  }

                  if ($isTextObj) {
                      // includes a "BT" that shows up inside a string; it is
                      // kept exactly once
                      $textObject .= $part;
                  } elseif ('BT' === $token) {
                      $isTextObj = true;
                  }
              }
          }

          return $textObjects;
      }

      /**
       * Concatenates the literal strings of the text objects.
       *
       * The operators Td, TD, T*, " and ' produce a line break when they occur
       * outside of a string, and every text object ends with a line break.
       * Escape sequences and placeholders are still unresolved in the result.
       *
       * @param array $textObjects text object bodies
       * @return string
       */
      private function extractText($textObjects)
      {
          $newLineOperators = array('Td', 'TD', 'T*', '"', "'");
          $return = '';
          // deliberately shared by all text objects: the content of a string
          // left unterminated in one object is prepended to the next string
          $string = '';

          foreach ($textObjects as $textObject) {
              $parts = preg_split(
                  '~(?:\s+(Td|TD|T\*|"|\')\s+)|(\()|(\))~ism',
                  $textObject,
                  -1,
                  PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
              );
              if (false === $parts) {
                  $parts = array();
              }

              $isString = false;
              $openBracketCount = 0;

              foreach ($parts as $part) {
                  if (in_array($part, $newLineOperators, true)) {
                      if (!$isString) {
                          $return .= "\n";
                      }
                  } elseif (')' === $part && $isString) {
                      if ($openBracketCount <= 0) {
                          // closing bracket of the string itself
                          $isString = false;
                          $return .= $string;
                          $string = '';
                      } else {
                          // closing bracket of a nested pair, stays in the text
                          $openBracketCount--;
                      }
                  }

                  if ($isString) {
                      $string .= $part;
                  }

                  if ('(' === $part) {
                      if ($isString) {
                          $openBracketCount++;
                      } else {
                          $isString = true;
                      }
                  }
              }

              $return .= "\n";
          }

          return $return;
      }

      /**
       * Resolves octal character codes, bracket placeholders and escape sequences.
       *
       * @param string $raw          text as produced by extractText()
       * @param string $fromEncoding iconv name of the document charset, used
       *                             to decode the octal character codes
       * @return string
       */
      private function decodeEscapes($raw, $fromEncoding)
      {
          // NOTE(review): in double quotes "\\\n" is a backslash followed by a
          // real line feed (not by the letter "n"), and "\\\b" is two
          // backslashes followed by "b" because PHP has no \b escape. The table
          // is kept as it was; confirm the intended sequences against the PDF
          // specification before changing it.
          $convert = array(
              '##STARTBRACKET##' => '(',
              '##ENDBRACKET##' => ')',
              '##STARTSBRACKET##' => '[',
              '##ENDSBRACKET##' => ']',
              "\\\n" => "\n",
              "\\\r" => "\n",
              "\\\n\r" => "\n",
              "\\\t" => "\t",
              "\\\b" => "\b",
              "\\\f" => "\f",
              '\\\\' => '\\'
          );

          // replace octal character codes (\ddd); codes up to 32 (control
          // characters and the space) are dropped
          $text = preg_replace_callback(
              '~\\\\([0-7]{3})~',
              function ($matches) use ($fromEncoding) {
                  $code = octdec($matches[1]);
                  if ($code <= 32) {
                      return '';
                  }
                  // "@": the byte may be undefined in the source charset
                  $char = @iconv($fromEncoding, 'UTF-8', chr($code & 0xFF));
                  return false === $char ? '' : $char;
              },
              $raw
          );

          return strtr((string) $text, $convert);
      }
  }
  259. ?>