PageRenderTime 25ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/modules/AOD_Index/PdfParser.php

https://gitlab.com/tjaafar/SuiteCRM
PHP | 337 lines | 210 code | 44 blank | 83 comment | 32 complexity | be5bfea712504059bba6b8b5d09df9f0 MD5 | raw file
  1. <?php
  2. /**
  3. * @file
  4. * Class PdfParser
  5. *
  6. * @author : Sebastien MALOT <sebastien@malot.fr>
  7. * @date : 2013-08-08
  8. *
  9. * References :
  10. * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
  11. * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
  12. * - http://www.php.net/manual/en/ref.pdf.php#74211
  13. */
  class PdfParser
  {
      /**
       * Extract the text of a PDF file stored on disk.
       *
       * @param string $filename Path of the PDF file to read.
       * @return string|null The extracted text, or null when the file cannot be
       *                     read or no text could be extracted.
       */
      public static function parseFile($filename)
      {
          $content = file_get_contents($filename);
          if ($content === false) {
              // PHP has already raised a warning; there is nothing to parse.
              return null;
          }

          return self::extractText($content);
      }

      /**
       * Extract the text of a PDF document that is already loaded in memory.
       *
       * @param string $content Raw bytes of the PDF document.
       * @return string|null The extracted text, or null when no text could be extracted.
       */
      public static function parseContent($content)
      {
          return self::extractText($content);
      }

      /**
       * Convert raw PDF data into text.
       *
       * The document is split into "obj ... endobj" sections. For each section
       * the first "<< ... >>" dictionary is used as the filter description and
       * the first "stream ... endstream" body as the data. Streams whose
       * dictionary mentions FlateDecode are inflated; all other streams are
       * handed to the text extractor as they are.
       *
       * @param string $data Raw bytes of the PDF document.
       * @return string|null The extracted text with whitespace runs collapsed to
       *                     one space, or null when nothing was extracted.
       */
      protected static function extractText($data)
      {
          $text = '';

          foreach (self::getDataArray($data, 'obj', 'endobj') as $object) {
              $dictionaries = self::getDataArray($object, '<<', '>>');
              $streams = self::getDataArray($object, 'stream', 'endstream');
              if (!isset($dictionaries[0]) || !isset($streams[0])) {
                  // Without a dictionary or a stream there is nothing to decode.
                  continue;
              }

              // Drop the surrounding 'stream' / 'endstream' keywords.
              $raw = (string) substr($streams[0], strlen('stream'), -strlen('endstream'));

              if (strpos($dictionaries[0], 'FlateDecode') !== false) {
                  // Compressed data is binary: it must not be blindly trimmed.
                  $decoded = self::inflateStream($raw);
              } else {
                  $decoded = trim($raw);
              }

              if ($decoded !== false && trim($decoded) !== '') {
                  $text .= ' ' . self::extractTextElements($decoded);
              }
          }

          if (trim($text) === '') {
              return null;
          }

          // Re-join words that were hyphenated across a line break, then
          // collapse every run of whitespace into a single space.
          $text = preg_replace('/\s*-[\r\n]+\s*/', '', $text);

          return preg_replace('/\s+/', ' ', $text);
      }

      /**
       * Inflate the body of a FlateDecode stream.
       *
       * The bytes between 'stream' and 'endstream' are normally wrapped in
       * end-of-line markers that are not part of the compressed data. Trimming
       * all trailing whitespace is not safe, because the last bytes of the zlib
       * checksum may themselves be whitespace or NUL bytes. Several candidate
       * boundaries are therefore tried; the zlib checksum guarantees that a
       * successful inflate is the right one.
       *
       * @param string $raw Bytes found between the 'stream' and 'endstream' keywords.
       * @return string|false The inflated data, or false when no candidate could be inflated.
       */
      protected static function inflateStream($raw)
      {
          // Safe on the left side: the first byte of a zlib stream always has
          // 8 in its low nibble, so it is never a whitespace or NUL byte.
          $payload = ltrim($raw);

          $candidates = array($payload);
          $last = substr($payload, -1);
          if ($last === "\n" || $last === "\r") {
              $candidates[] = (string) substr($payload, 0, -1);
              if (substr($payload, -2) === "\r\n") {
                  $candidates[] = (string) substr($payload, 0, -2);
              }
          }
          // Historical behaviour, kept as a last resort.
          $candidates[] = rtrim($payload);

          foreach (array_unique($candidates) as $candidate) {
              // gzuncompress() raises a warning on every failed candidate; the
              // failure is handled through the return value instead.
              $inflated = @gzuncompress($candidate);
              if ($inflated !== false) {
                  return $inflated;
              }
          }

          return false;
      }

      /**
       * Extract the text shown by the operators of a decoded stream.
       *
       * The stream is processed line by line. Each line is split into its
       * operands and a trailing operator; only the text-related operators
       * (Td, TD, Tf, TJ, Tj, T*) contribute to the result.
       *
       * @param string $content Decoded stream data.
       * @return string The text found in the stream ('' for /CIDInit streams).
       */
      protected static function extractTextElements($content)
      {
          if (strpos($content, '/CIDInit') === 0) {
              return '';
          }

          $text = '';

          foreach (explode("\n", $content) as $line) {
              $line = trim($line);
              $matches = array();

              // The operand part is optional so that operators without
              // operands (a standalone "T*", for instance) are recognised too.
              if (preg_match('/^(?<command>.*[\)\] ])?(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
                  $command = self::decodeOperands(trim($matches['command']));
                  $operator = $matches['operator'];
              } else {
                  $command = $line;
                  $operator = '';
              }

              switch ($operator) {
                  // Move text current point.
                  case 'Td':
                      $operands = explode(' ', $command);
                      // Explicit casts: comparing raw strings with integers
                      // does not behave the same on PHP 7 and PHP 8.
                      $y = (float) array_pop($operands);
                      $x = (float) array_pop($operands);
                      if ($x > 0) {
                          $text .= ' ';
                      }
                      if ($y < 0) {
                          $text .= ' ';
                      }
                      break;

                  // Move text current point and set leading.
                  case 'TD':
                      $operands = explode(' ', $command);
                      if ((float) array_pop($operands) < 0) {
                          $text .= "\n";
                      }
                      break;

                  // Set font name and size.
                  case 'Tf':
                      $text .= ' ';
                      break;

                  // Display text, allowing individual character positioning.
                  case 'TJ':
                      $text .= self::parseTextCommand(self::extractDelimited($command, '[', ']'));
                      break;

                  // Display text.
                  case 'Tj':
                      $text .= self::extractDelimited($command, '(', ')');
                      break;

                  // Move to start of next line.
                  case 'T*':
                      $text .= "\n";
                      break;

                  // Every other operator (Tc, TL, Tm, Tr, Ts, Tw, Tz, g, gs, re,
                  // f, BT, ET, ...) and lines without an operator add no text.
                  default:
                      break;
              }
          }

          // Unescape the round brackets that were escaped inside string operands.
          return str_replace(array('\\(', '\\)'), array('(', ')'), $text);
      }

      /**
       * Decode the escape sequences of an operand string and convert it to UTF-8.
       *
       * @param string $command Operand part of a content stream line.
       * @return string The decoded operands.
       */
      protected static function decodeOperands($command)
      {
          // Octal escapes are decoded in one pass, so a decoded backslash can
          // never be combined with the following digits into a new escape.
          $command = preg_replace_callback(
              '/\\\\([0-7]{3})/',
              function ($match) {
                  $code = octdec($match[1]);

                  // Codes below 32 are non printable characters and are dropped.
                  return $code < 32 ? '' : chr($code);
              },
              $command
          );

          // Remove escaped line breaks, then turn the remaining escaped
          // control characters into spaces.
          $command = preg_replace('/\\\\[\r\n]/', '', $command);
          $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

          // Force UTF-8 charset.
          $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));
          if (strtoupper((string) $encoding) != 'UTF-8') {
              // iconv() raises a notice on invalid input; the original operands
              // are kept when the conversion fails.
              $converted = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command);
              if ($converted) {
                  $command = $converted;
              }
          }

          return $command;
      }

      /**
       * Return what lies between the first $open and the last $close character.
       *
       * @param string $command Operand string to search.
       * @param string $open    Opening delimiter.
       * @param string $close   Closing delimiter.
       * @return string The enclosed text, or '' when the delimiters are missing
       *                or in the wrong order.
       */
      protected static function extractDelimited($command, $open, $close)
      {
          $start = mb_strpos($command, $open, 0, 'UTF-8');
          $end = mb_strrpos($command, $close, 0, 'UTF-8');
          if ($start === false || $end === false || $end <= $start) {
              return '';
          }

          return mb_substr($command, $start + 1, $end - $start - 1, 'UTF-8');
      }

      /**
       * Strip out the text from the operand array of a TJ operator.
       *
       * The array mixes "(text)" elements with numeric position adjustments. A
       * space is inserted before an element when the adjustment in front of it
       * is below -50, or when more than 8 characters separate it from the
       * previous element.
       *
       * @param string $text      Content of the TJ array, without the square brackets.
       * @param int    $font_size Currently not used.
       *
       * @return string
       */
      protected static function parseTextCommand($text, $font_size = 0)
      {
          $result = '';
          $cursor = 0;

          while (($open = mb_strpos($text, '(', $cursor, 'UTF-8')) !== false) {
              $gap = $open - $cursor;
              if ($gap > 8) {
                  $spacing = ' ';
              } else {
                  // The cast matters: on PHP 8 an empty or non-numeric gap
                  // compared as a string would sort before -50 and insert a
                  // space in front of every element.
                  $adjustment = (float) mb_substr($text, $cursor, $gap, 'UTF-8');
                  $spacing = $adjustment < -50 ? ' ' : '';
              }

              // Find the closing bracket, skipping the escaped ones.
              $content_start = $open + 1;
              $search_from = $content_start;
              while (($close = mb_strpos($text, ')', $search_from, 'UTF-8')) !== false) {
                  if (mb_substr($text, $close - 1, 1, 'UTF-8') != '\\') {
                      break;
                  }
                  $search_from = $close + 1;
              }

              if ($close === false) {
                  // Unbalanced brackets: give up on the rest of the array.
                  break;
              }

              $result .= $spacing . mb_substr($text, $content_start, $close - $content_start, 'UTF-8');
              $cursor = $close + 1;
          }

          return $result;
      }

      /**
       * Convert a section of data into an array, separated by the start and end words.
       *
       * Each entry runs from a start word up to and including the next end
       * word. The search for the following entry resumes after that end word,
       * so a start word contained in the end word ('obj' in 'endobj') is not
       * mistaken for the beginning of a new section.
       *
       * @param string $data       The data.
       * @param string $start_word The start of each section of data.
       * @param string $end_word   The end of each section of data.
       * @return array The array of data.
       */
      protected static function getDataArray($data, $start_word, $end_word)
      {
          $data = (string) $data;
          $sections = array();
          $offset = 0;

          while (($start = strpos($data, $start_word, $offset)) !== false) {
              $end = strpos($data, $end_word, $start);
              if ($end === false) {
                  break;
              }

              $offset = $end + strlen($end_word);
              $sections[] = substr($data, $start, $offset - $start);
          }

          return $sections;
      }
  }