PageRenderTime 74ms CodeModel.GetById 36ms RepoModel.GetById 0ms app.codeStats 0ms

/textlib.php

https://github.com/danmarsden/moodle-plagiarism_crot
PHP | 1324 lines | 859 code | 124 blank | 341 comment | 170 complexity | ec5d652f86a0fe68c7d4a489daf3b090 MD5 | raw file

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. /*
  3. This library is to process external files of different types
  4. Adopted mostly from DonRamon
  5. http://habrahabr.ru/blogs/php/70119/
  6. http://habrahabr.ru/blogs/php/69417/
  7. */
  8. // function getTextFromZippedXML
  9. // allows to work with .docx and .odt files
  10. // thanks to DonRamon http://habrahabr.ru/blogs/php/69417/
  11. function getTextFromZippedXML($archiveFile, $contentFile) {
  12. // create zip archive in the memory
  13. $zip = new ZipArchive;
  14. // open zip file
  15. if ($zip->open($archiveFile)) {
  16. // check the file in the archive
  17. if (($index = $zip->locateName($contentFile)) !== false) {
  18. // if found read in text variable
  19. $content = $zip->getFromIndex($index);
  20. // close the archive, we don't need it anymore
  21. $zip->close();
  22. // sw
  23. $content=str_replace("<w:p ","\n<w:p ",$content);
  24. // TODO add all entities and includes
  25. // skip all errors and warnings
  26. $xml = DOMDocument::loadXML($content, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
  27. // return data without wml tags
  28. return strip_tags($xml->saveXML());
  29. } else {echo "Not found!";}
  30. $zip->close();
  31. }
  32. // if something wron return ERROR text
  33. return "ERROR in text Tokenization";
  34. }
  35. //
  36. // functions rtf_isPlainText and rtf2text
  37. // support RTF files
  38. //
  39. // thanks to DonRamon http://habrahabr.ru/blogs/php/70119/
  40. function rtf_isPlainText($s) {
  41. $failAt = array("*", "fonttbl", "colortbl", "datastore", "themedata");
  42. for ($i = 0; $i < count($failAt); $i++)
  43. if (!empty($s[$failAt[$i]])) return false;
  44. return true;
  45. }
  46. function rtf2text($filename) {
  47. $text = file_get_contents($filename);
  48. if (!strlen($text))
  49. return "";
  50. // start with empty stack of modifiers
  51. $document = "";
  52. $stack = array();
  53. $j = -1;
  54. // read chars from buffer...
  55. for ($i = 0, $len = strlen($text); $i < $len; $i++) {
  56. $c = $text[$i];
  57. // select what to do with the current char
  58. switch ($c) {
  59. // the most important key \
  60. case "\\":
  61. // read the next char
  62. $nc = $text[$i + 1];
  63. // put into the out stream
  64. if ($nc == '\\' && rtf_isPlainText($stack[$j])) $document .= '\\';
  65. elseif ($nc == '~' && rtf_isPlainText($stack[$j])) $document .= ' ';
  66. elseif ($nc == '_' && rtf_isPlainText($stack[$j])) $document .= '-';
  67. // * goes to stack
  68. elseif ($nc == '*') $stack[$j]["*"] = true;
  69. elseif ($nc == "'") {
  70. $hex = substr($text, $i + 2, 2);
  71. if (rtf_isPlainText($stack[$j]))
  72. $document .= html_entity_decode("&#".hexdec($hex).";");
  73. // move the index
  74. $i += 2;
  75. // read the key symbol
  76. } elseif ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
  77. $word = "";
  78. $param = null;
  79. // read after \
  80. for ($k = $i + 1, $m = 0; $k < strlen($text); $k++, $m++) {
  81. $nc = $text[$k];
  82. if ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
  83. if (empty($param))
  84. $word .= $nc;
  85. else
  86. break;
  87. } elseif ($nc >= '0' && $nc <= '9')
  88. $param .= $nc;
  89. elseif ($nc == '-') {
  90. if (empty($param))
  91. $param .= $nc;
  92. else
  93. break;
  94. // end
  95. } else
  96. break;
  97. }
  98. // move the index
  99. $i += $m - 1;
  100. // read the word
  101. $toText = "";
  102. switch (strtolower($word)) {
  103. case "u":
  104. $toText .= html_entity_decode("&#x".dechex($param).";");
  105. $ucDelta = @$stack[$j]["uc"];
  106. if ($ucDelta > 0)
  107. $i += $ucDelta;
  108. break;
  109. case "par": case "page": case "column": case "line": case "lbr":
  110. $toText .= "\n";
  111. break;
  112. case "emspace": case "enspace": case "qmspace":
  113. $toText .= " ";
  114. break;
  115. case "tab": $toText .= "\t"; break;
  116. case "chdate": $toText .= date("m.d.Y"); break;
  117. case "chdpl": $toText .= date("l, j F Y"); break;
  118. case "chdpa": $toText .= date("D, j M Y"); break;
  119. case "chtime": $toText .= date("H:i:s"); break;
  120. case "emdash": $toText .= html_entity_decode("&mdash;"); break;
  121. case "endash": $toText .= html_entity_decode("&ndash;"); break;
  122. case "bullet": $toText .= html_entity_decode("&#149;"); break;
  123. case "lquote": $toText .= html_entity_decode("&lsquo;"); break;
  124. case "rquote": $toText .= html_entity_decode("&rsquo;"); break;
  125. case "ldblquote": $toText .= html_entity_decode("&laquo;"); break;
  126. case "rdblquote": $toText .= html_entity_decode("&raquo;"); break;
  127. default:
  128. $stack[$j][strtolower($word)] = empty($param) ? true : $param;
  129. break;
  130. }
  131. if (rtf_isPlainText($stack[$j]))
  132. $document .= $toText;
  133. }
  134. $i++;
  135. break;
  136. case "{":
  137. array_push($stack, $stack[$j++]);
  138. break;
  139. // } removes current stack. Group is over.
  140. case "}":
  141. array_pop($stack);
  142. $j--;
  143. break;
  144. //
  145. case '\0': case '\r': case '\f': case '\n': break;
  146. //
  147. default:
  148. if (rtf_isPlainText($stack[$j]))
  149. $document .= $c;
  150. break;
  151. }
  152. }
  153. //
  154. return $document;
  155. }// end rtf2text
  156. // Reading text from PDF
  157. // Версия 0.3
  158. // Author: Алексей Рембиш a.k.a Ramon
  159. // E-mail: alex@rembish.ru
  160. // Copyright 2009
  161. // Partial translation by Sergey
  162. function decodeAsciiHex($input) {
  163. $output = "";
  164. $isOdd = true;
  165. $isComment = false;
  166. for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
  167. $c = $input[$i];
  168. if($isComment) {
  169. if ($c == '\r' || $c == '\n')
  170. $isComment = false;
  171. continue;
  172. }
  173. switch($c) {
  174. case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
  175. case '%':
  176. $isComment = true;
  177. break;
  178. default:
  179. $code = hexdec($c);
  180. if($code === 0 && $c != '0')
  181. return "";
  182. if($isOdd)
  183. $codeHigh = $code;
  184. else
  185. $output .= chr($codeHigh * 16 + $code);
  186. $isOdd = !$isOdd;
  187. break;
  188. }
  189. }
  190. if($input[$i] != '>')
  191. return "";
  192. if($isOdd)
  193. $output .= chr($codeHigh * 16);
  194. return $output;
  195. }
  196. function decodeAscii85($input) {
  197. $output = "";
  198. $isComment = false;
  199. $ords = array();
  200. for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
  201. $c = $input[$i];
  202. if($isComment) {
  203. if ($c == '\r' || $c == '\n')
  204. $isComment = false;
  205. continue;
  206. }
  207. if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
  208. continue;
  209. if ($c == '%') {
  210. $isComment = true;
  211. continue;
  212. }
  213. if ($c == 'z' && $state === 0) {
  214. $output .= str_repeat(chr(0), 4);
  215. continue;
  216. }
  217. if ($c < '!' || $c > 'u')
  218. return "";
  219. $code = ord($input[$i]) & 0xff;
  220. $ords[$state++] = $code - ord('!');
  221. if ($state == 5) {
  222. $state = 0;
  223. for ($sum = 0, $j = 0; $j < 5; $j++)
  224. $sum = $sum * 85 + $ords[$j];
  225. for ($j = 3; $j >= 0; $j--)
  226. $output .= chr($sum >> ($j * 8));
  227. }
  228. }
  229. if ($state === 1)
  230. return "";
  231. elseif ($state > 1) {
  232. for ($i = 0, $sum = 0; $i < $state; $i++)
  233. $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
  234. for ($i = 0; $i < $state - 1; $i++)
  235. $output .= chr($sum >> ((3 - $i) * 8));
  236. }
  237. return $output;
  238. }
  239. function decodeFlate($input) {
  240. // The most common compression method for data streams in PDF.
  241. // Very easy to deal with using libraries.
  242. return @gzuncompress($input);
  243. }
  244. function getObjectOptions($object) {
  245. // We need to get current object attrbutes. These attributes are
  246. // located between << and >>. Each option starts with /.
  247. $options = array();
  248. if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
  249. // Separate options from each other using /. First empty one should be removed from the array.
  250. $options = explode("/", $options[1]);
  251. @array_shift($options);
  252. // Create handy array for current object attributes
  253. // Attributs that look like "/Option N" will be written to hash
  254. // as "Option" => N, and properties like "/Param", will be written as
  255. // "Param" => true.
  256. $o = array();
  257. for ($j = 0; $j < @count($options); $j++) {
  258. $options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
  259. if (strpos($options[$j], " ") !== false) {
  260. $parts = explode(" ", $options[$j]);
  261. $o[$parts[0]] = $parts[1];
  262. } else
  263. $o[$options[$j]] = true;
  264. }
  265. $options = $o;
  266. unset($o);
  267. }
  268. // Return an array of parameters we found
  269. return $options;
  270. }
  271. function getDecodedStream($stream, $options) {
  272. // Now we have a stream that is possibly coded with some compression method(s)
  273. // Lets try to decode it.
  274. $data = "";
  275. // If current stream has Filter attribute, then is is definately compressed or en coded
  276. // Otherwise just return the content
  277. if (empty($options["Filter"]))
  278. $data = $stream;
  279. else {
  280. // If we know the size of data stream from options then we need to cut the data
  281. // using this size, or we may not be able to decode it or maybe something else will go wring
  282. $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
  283. $_stream = substr($stream, 0, $length);
  284. // Looping through options looking for indicatiors of data compression in the current stream.
  285. // PDF supprts many different stuff, but text can be coded either by ASCII Hex, or ASCII 85-base or GZ/Deflate
  286. // We need to look for these keys and apply respecrtive functions for decoding.
  287. // There is another option: Crypt, but we are not going to work with encrypted PDF's.
  288. foreach ($options as $key => $value) {
  289. if ($key == "ASCIIHexDecode")
  290. $_stream = decodeAsciiHex($_stream);
  291. if ($key == "ASCII85Decode")
  292. $_stream = decodeAscii85($_stream);
  293. if ($key == "FlateDecode")
  294. $_stream = decodeFlate($_stream);
  295. }
  296. $data = $_stream;
  297. }
  298. // Return the result
  299. return $data;
  300. }
  301. function getDirtyTexts(&$texts, $textContainers) {
  302. // So we have an array of text contatiners that were taken from both BT and ET.
  303. // Our new task is to find a text in them that would be displayed by viewers
  304. // on the screen. There are many options to do that, Lets check the pair: [...] TJ and Td (...) Tj
  305. for ($j = 0; $j < count($textContainers); $j++) {
  306. // Add the pieces of row data the we found to the general array of text objects.
  307. if (preg_match_all("#\[(.*)\]\s*TJ#ismU", $textContainers[$j], $parts))
  308. $texts = array_merge($texts, @$parts[1]);
  309. elseif(preg_match_all("#Td\s*(\(.*\))\s*Tj#ismU", $textContainers[$j], $parts))
  310. $texts = array_merge($texts, @$parts[1]);
  311. }
  312. }
  313. function getCharTransformations(&$transformations, $stream) {
  314. // Oh Mama Mia! As far as I know nobody did it before. At least not in the open source.
  315. // We are going to have some fun now - search in symbol transformation streams.
  316. // Under transforation I mean conversion of ony symbol to hex form or even to some kind of sequence.
  317. // We need all the attributes that we can find in the current stream.
  318. // Data between beginbfchar and endbfchar transform one hex-code intn another (or sequence of codes)
  319. // separately. Between beginbfrange and endbfrange the transformation of data sequences is taking place
  320. // and it reduces the number of definitions.
  321. preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
  322. preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
  323. // First of all process separate symbols. Transformaiton string looks as follows:
  324. // - <0123> <abcd> -> 0123 should be transformed to abcd;
  325. // - <0123> <abcd6789> -> 0123 should be transformed to many symbols (abcd and 6789 in this case)
  326. for ($j = 0; $j < count($chars); $j++) {
  327. // There is a number of strings before data list that we are going ot read. We gonna use it later on.
  328. $count = $chars[$j][1];
  329. $current = explode("\n", trim($chars[$j][2]));
  330. // Read data from each string.
  331. for ($k = 0; $k < $count && $k < count($current); $k++) {
  332. // Wrute the transformation we just found. Don't forget about writing leading zeros if there are less then 4 digits..
  333. if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
  334. $transformations[str_pad($map[1], 4, "0")] = $map[2];
  335. }
  336. }
  337. // Now we can deal with sequences. Manuals are saying that they can be one of two possible types
  338. // - <0000> <0020> <0a00> -> in this case <0000> will be substituted with <0a00>, <0001> with <0a01> and so on
  339. // till <0020>, that will be substituted with <0a20>.
  340. // OR
  341. // - <0000> <0002> [<abcd> <01234567> <8900>] -> here it works in a bit different way. We need to look how
  342. // many elemants are located between <0000> and <0002> (its actually three including 0001). After it we assign to each element
  343. // a corresponding value from [ ]: 0000 -> abcd, 0001 -> 0123 4567, а 0002 -> 8900.
  344. for ($j = 0; $j < count($ranges); $j++) {
  345. // We need to cross check the number of elements for transofrmation.
  346. $count = $ranges[$j][1];
  347. $current = explode("\n", trim($ranges[$j][2]));
  348. // Working with each string
  349. for ($k = 0; $k < $count && $k < count($current); $k++) {
  350. // This is first type sequence.
  351. if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
  352. // Convert data into decimal system: looping will be easier.
  353. $from = hexdec($map[1]);
  354. $to = hexdec($map[2]);
  355. $_from = hexdec($map[3]);
  356. // We put all the elements from the sequence into transformations array.
  357. // According to manuals we need also to ass leading zeros if hex-code size is less than 4 symbols.
  358. for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
  359. $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
  360. // Second option.
  361. } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
  362. // This is also beginnigna nd end of the sequence. Split data in [ ] by symbols located near to spaces.
  363. $from = hexdec($map[1]);
  364. $to = hexdec($map[2]);
  365. $parts = preg_split("#\s+#", trim($map[3]));
  366. // Loop through data and assign the new values accordingly.
  367. for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
  368. $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
  369. }
  370. }
  371. }
  372. }
  373. function getTextUsingTransformations($texts, $transformations) {
  374. // Second phase - getting text out of raw data.
  375. // In PDF "dirty" text strings may look as follows:
  376. // - (I love)10(PHP) - in this case text data a re located in (),
  377. // and 10 is number of spaces.
  378. // - <01234567> - in this case we deal with 2 symbols represented in HEX:
  379. // : 0123 and 4567. Substitutions for both should be checked inthe substitution table.
  380. // - (Hello, \123world!) - here \123 is symbol in octal system and we need to handle it properly.
  381. // Lets go. We are accumulating text data processign "raw" pieces of text
  382. $document = "";
  383. for ($i = 0; $i < count($texts); $i++) {
  384. // 2 cases are possible: text can be either in <> (hex) or in () (plain).
  385. $isHex = false;
  386. $isPlain = false;
  387. $hex = "";
  388. $plain = "";
  389. // scan current piece of text.
  390. for ($j = 0; $j < strlen($texts[$i]); $j++) {
  391. // get current char
  392. $c = $texts[$i][$j];
  393. // ...and decide what to do with it.
  394. switch($c) {
  395. // We have hex data in front of us
  396. case "<":
  397. $hex = "";
  398. $isHex = true;
  399. break;
  400. // Hex data are over. Lets parse them.
  401. case ">":
  402. // split the string into chunks of 4 chars...
  403. $hexs = str_split($hex, 4);
  404. // ...and cheking what we can do with each chunk
  405. for ($k = 0; $k < count($hexs); $k++) {
  406. // if there are less then 4 symbols then the manual says that we need to add zeros after them
  407. $chex = str_pad($hexs[$k], 4, "0");
  408. // Checking if current hex-code is already in transformations.
  409. // If this is the case change this piece to the required.
  410. if (isset($transformations[$chex]))
  411. $chex = $transformations[$chex];
  412. // Write a new Unicode symbol into the output .
  413. $document .= html_entity_decode("&#x".$chex.";");
  414. }
  415. // Hex-sata are over. Need to say it.
  416. $isHex = false;
  417. break;
  418. // There is a piece of "plain" text
  419. case "(":
  420. $plain = "";
  421. $isPlain = true;
  422. break;
  423. // Well... this piece will be over sometime.
  424. case ")":
  425. // Get the text we just got into the output stream.
  426. $document .= $plain;
  427. $isPlain = false;
  428. break;
  429. // Specail symbol. Lets see what is located after it.
  430. case "\\":
  431. $c2 = $texts[$i][$j + 1];
  432. // If it is \ ot either one of ( or ), then print them as it is.
  433. if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
  434. // If it is empty space of EOL then process it.
  435. elseif ($c2 == "n") $plain .= '\n';
  436. elseif ($c2 == "r") $plain .= '\r';
  437. elseif ($c2 == "t") $plain .= '\t';
  438. elseif ($c2 == "b") $plain .= '\b';
  439. elseif ($c2 == "f") $plain .= '\f';
  440. // It might happen that a digit follows after \ . It may be up to 3 of them.
  441. // They represent sybmol code in octal system. Lets parse them.
  442. elseif ($c2 >= '0' && $c2 <= '9') {
  443. // We need 3 digits. No more than 3. Digits only.
  444. $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
  445. // Getting the number of characters we already have taken. We need it to shift the position of current char properly.
  446. $j += strlen($oct) - 1;
  447. // Put the respective char into "plain" text.
  448. $plain .= html_entity_decode("&#".octdec($oct).";");
  449. }
  450. // We increased the position of current symbol at least by one. Need to inform parser about that.
  451. $j++;
  452. break;
  453. // If we have something else then write current symbol into temporaty hex string (if we had < before),
  454. default:
  455. if ($isHex)
  456. $hex .= $c;
  457. // or into "plain" string if ( was opeon.
  458. if ($isPlain)
  459. $plain .= $c;
  460. break;
  461. }
  462. }
  463. // Define text blocks by EOL
  464. $document .= "\n";
  465. }
  466. // Return text.
  467. return $document;
  468. }
  469. function pdf2text($filename) {
  470. // Read from the pdf file into string keeping in mind that file may contain binary streams
  471. $infile = @file_get_contents($filename, FILE_BINARY);
  472. if (empty($infile))
  473. return "";
  474. // First iteration. We need to get all the text data from file.
  475. // We'll get only "raw" data after the firs iteration. These data will include positioning,
  476. // hex entries, etc.
  477. $transformations = array();
  478. $texts = array();
  479. // Get list of all files from pdf file.
  480. preg_match_all("#obj(.*)endobj#ismU", $infile, $objects);
  481. $objects = @$objects[1];
  482. // Let start the crawling. Apart fromthe text we can meet some other stuff including fonts.
  483. for ($i = 0; $i < count($objects); $i++) {
  484. $currentObject = $objects[$i];
  485. // Check if there is data stream in the current object.
  486. // Almost all the time it will be compressed with gzip.
  487. if (preg_match("#stream(.*)endstream#ismU", $currentObject, $stream)) {
  488. $stream = ltrim($stream[1]);
  489. // Read the attributes of this object. We are looking only
  490. // for text, so we have to do minimal cuts to improve the speed
  491. $options = getObjectOptions($currentObject);
  492. if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
  493. continue;
  494. // So, we "may" have text in from of us. Lets decode it from binary file to get the plain text.
  495. $data = getDecodedStream($stream, $options);
  496. if (strlen($data)) {
  497. // We need to find text container in the current stream.
  498. // If we will be able to get it the raw text we found will be added to the previous findings.
  499. if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) {
  500. $textContainers = @$textContainers[1];
  501. getDirtyTexts($texts, $textContainers);
  502. // Otherwise we'll try to use symbol transformations that we gonna use on the 2nd step.
  503. } else
  504. getCharTransformations($transformations, $data);
  505. }
  506. }
  507. }
  508. // After the preliminary parsing of pdf-document we need to parse
  509. // the text blocks we got in the context of simbolic transformations. Return the result after we done.
  510. return getTextUsingTransformations($texts, $transformations);
  511. }
  512. // Reading WCBFF
  513. // Version 0.2
  514. // Author: Алексей Рембиш a.k.a Ramon
  515. // E-mail: alex@rembish.ru
  516. // Copyright 2009
  517. // So, my little friends, below you can see a class that works with WCBFF (Windows Compound Binary File Format).
  518. // Why do we need it? This format serves as the basis for such "delicious" formats as .doc, .xls and .ppt.
  519. // Let's see what it looks like.
  520. class cfb {
  521. // We gonna read the content of the file we need to decode into this variable.
  522. protected $data = "";
  523. // Sizes of FAT sector (1 << 9 = 512), Mini FAT sector (1 << 6 = 64) and maximum size
  524. // of the stream that could be written into a miniFAT.
  525. protected $sectorShift = 9;
  526. protected $miniSectorShift = 6;
  527. protected $miniSectorCutoff = 4096;
  528. // FAT-sector sequence array and Array of "files" belonging to this file structure
  529. protected $fatChains = array();
  530. protected $fatEntries = array();
  531. // Array of sequences of Mini FAT-sectors and the whole Mini FAT of our file
  532. protected $miniFATChains = array();
  533. protected $miniFAT = "";
  534. // Version (3 or 4), and way to write numbers (little-endian)
  535. private $version = 3;
  536. private $isLittleEndian = true;
  537. // The number of "files" and the position fo the first "file" in FAT
  538. private $cDir = 0;
  539. private $fDir = 0;
  540. // The number of FAT sectors in the file
  541. private $cFAT = 0;
  542. // The number of miniFAT-sectors and position of sequences of miniFAT-сsectors in the file
  543. private $cMiniFAT = 0;
  544. private $fMiniFAT = 0;
  545. // DIFAT: number of such sectors and offset to sector 110 (first 109 sectors are located in the header)
  546. private $DIFAT = array();
  547. private $cDIFAT = 0;
  548. private $fDIFAT = 0;
  549. // Constants: end of sequence and empty sector (4 bytes each)
  550. const ENDOFCHAIN = 0xFFFFFFFE;
  551. const FREESECT = 0xFFFFFFFF;
  552. // Read the file into internal variable
  553. public function read($filename) {
  554. $this->data = file_get_contents($filename);
  555. }
  556. public function parse() {
  557. // First of all we need to check weither we really have CFB in front of us.?
  558. // To do it we read the first 8 bytes and compare them with 2 patterns: common and the old one
  559. $abSig = strtoupper(bin2hex(substr($this->data, 0, 8)));
  560. if ($abSig != "D0CF11E0A1B11AE1" && $abSig != "0E11FC0DD0CF11E0") { return false; }
  561. // Read the file header;
  562. $this->readHeader();
  563. // get the remaining DIFAT sectors if any;
  564. $this->readDIFAT();
  565. // read the sequence of FAT sectors
  566. $this->readFATChains();
  567. // read the sequence of MiniFAT-sectors
  568. $this->readMiniFATChains();
  569. // read the structure of "directories" within the file
  570. $this->readDirectoryStructure();
  571. // Finally we need to check the root entry in the file structure.
  572. // This stream is required ot be in a file at least because it has a link
  573. // to file's miniFAT that we gonna read into $this->miniFAT
  574. $reStreamID = $this->getStreamIdByName("Root Entry");
  575. if ($reStreamID === false) { return false; }
  576. $this->miniFAT = $this->getStreamById($reStreamID, true);
  577. // Remove the unnecessary link to DIFAT-sectors, we have "stolen" complete FAT sequences instead of them.
  578. unset($this->DIFAT);
  579. // After all this we should be able to work with any of the "upper" formats from Microsoft such as doc, xls или ppt.
  580. }
  581. // Function that looks for stream number in the directory structure by its name.
  582. // It returns false if nothing was found.
  583. public function getStreamIdByName($name) {
  584. for($i = 0; $i < count($this->fatEntries); $i++) {
  585. if ($this->fatEntries[$i]["name"] == $name)
  586. return $i;
  587. }
  588. return false;
  589. }
  590. // Function gets the stream number ($id) and a second parameter (second perameter is required for the root entry only).
  591. // It returns the binary content fo this stream.
  592. public function getStreamById($id, $isRoot = false) {
  593. $entry = $this->fatEntries[$id];
  594. // Get the size and offset position to the content of "current" file.
  595. $from = $entry["start"];
  596. $size = $entry["size"];
  597. // Now 2 options are possible: is size is less than 4096 byte, then we need ot read data
  598. // from MiniFAT. If more than 4096 read from the common FAT. RootEntry is an exclusion:
  599. // we need ot read contents from FAT as miniFAT is located there.
  600. $stream = "";
  601. // So, here is the 1st option: small size and not root.
  602. if ($size < $this->miniSectorCutoff && !$isRoot) {
  603. // Get the miniFAT sector size - 64 bytes
  604. $ssize = 1 << $this->miniSectorShift;
  605. do {
  606. // Get the offset in miniFAT
  607. $start = $from << $this->miniSectorShift;
  608. // Read miniFAT-sector
  609. $stream .= substr($this->miniFAT, $start, $ssize);
  610. // Get the next piece of miniFAT in the array of chains
  611. $from = $this->miniFATChains[$from];
  612. // While not end of chain (sequence).
  613. } while ($from != self::ENDOFCHAIN);
  614. } else {
  615. // Second option - large piece - read it from FAT.
  616. // Get the sector size - 512 (or 4096 for new versions)
  617. $ssize = 1 << $this->sectorShift;
  618. do {
  619. // Getting the offset in the file (taking into account that there is a header of 512 bytes in the begining)
  620. $start = ($from + 1) << $this->sectorShift;
  621. // Read a sector
  622. $stream .= substr($this->data, $start, $ssize);
  623. // Get the next sector inthe array of FAT chains
  624. $from = $this->fatChains[$from];
  625. // While not end of chain (sequence).
  626. } while ($from != self::ENDOFCHAIN);
  627. }
  628. // Return the stream content accrding to its size.
  629. return substr($stream, 0, $size);
  630. }
  631. // This function reads data from file header
  632. private function readHeader() {
  633. // We need to get the information about the data format in the file
  634. $uByteOrder = strtoupper(bin2hex(substr($this->data, 0x1C, 2)));
  635. // We need to check if it is little-endian record
  636. $this->isLittleEndian = $uByteOrder == "FEFF";
  637. // Version 3 or 4 (never actually met 4th, but its description appears in the manual)
  638. $this->version = $this->getShort(0x1A);
  639. // Offsets for FAT and miniFAT
  640. $this->sectorShift = $this->getShort(0x1E);
  641. $this->miniSectorShift = $this->getShort(0x20);
  642. $this->miniSectorCutoff = $this->getLong(0x38);
  643. // Number of entries in the directory and offset to the first description in the file
  644. if ($this->version == 4)
  645. $this->cDir = $this->getLong(0x28);
  646. $this->fDir = $this->getLong(0x30);
  647. // Number of FAT sectors in the file
  648. $this->cFAT = $this->getLong(0x2C);
  649. // Number and position of hte 1st miniFAT-sector of sequences.
  650. $this->cMiniFAT = $this->getLong(0x40);
  651. $this->fMiniFAT = $this->getLong(0x3C);
  652. // Where are the FAT sector chains and how many of them are there.
  653. $this->cDIFAT = $this->getLong(0x48);
  654. $this->fDIFAT = $this->getLong(0x44);
  655. }
  656. // So.... DIFAT. DIFAT shows in which sectors we can find descriptions of FAT sector chains
  657. // Without these chains we won't be able to get stream contents in fragmented files
  658. private function readDIFAT() {
  659. $this->DIFAT = array();
  660. // First 109 links to sequences are being stored in the header of our file
  661. for ($i = 0; $i < 109; $i++)
  662. $this->DIFAT[$i] = $this->getLong(0x4C + $i * 4);
  663. // we also check if there are other links to chains. in small (upto 8.5MB) there is no such
  664. // links but in larger files we have to read them.
  665. if ($this->fDIFAT != self::ENDOFCHAIN) {
  666. // Sector size and start position to read links.
  667. $size = 1 << $this->sectorShift;
  668. $from = $this->fDIFAT;
  669. $j = 0;
  670. do {
  671. // Get the position in the file considering header
  672. $start = ($from + 1) << $this->sectorShift;
  673. // Read the links to sequences' sectors
  674. for ($i = 0; $i < ($size - 4); $i += 4)
  675. $this->DIFAT[] = $this->getLong($start + $i);
  676. // Getting the next DIFAT-sector. Link to this sector is written
  677. // as the last "word" in the current DIFAT-sector
  678. $from = $this->getLong($start + $i);
  679. // Ef sector exists we need to move there
  680. } while ($from != self::ENDOFCHAIN && ++$j < $this->cDIFAT);
  681. }
  682. // Remove the unnecessary links.
  683. while($this->DIFAT[count($this->DIFAT) - 1] == self::FREESECT)
  684. array_pop($this->DIFAT);
  685. }
  686. // So, we done with reading DIFAT. Now chains of FAT sectors should be converted
  687. // Lets go further.
  688. private function readFATChains() {
  689. // Sector size
  690. $size = 1 << $this->sectorShift;
  691. $this->fatChains = array();
  692. // Going through DIFAT array.
  693. for ($i = 0; $i < count($this->DIFAT); $i++) {
  694. // Go to the sector that we were looking for (with the header)
  695. $from = ($this->DIFAT[$i] + 1) << $this->sectorShift;
  696. // Getting the FAT chain: array index is a current sector,
  697. // value from an array s index of the next element or
  698. // ENDOFCHAIN - if it is last element in the chain.
  699. for ($j = 0; $j < $size; $j += 4)
  700. $this->fatChains[] = $this->getLong($from + $j);
  701. }
  702. }
  703. // We done with reading of FAT sequences. Now heed to read MiniFAT-sequences exaactly the same way.
  // Loads the MiniFAT into $this->miniFATChains.
  // Starting at sector $this->fMiniFAT, every sector of the MiniFAT chain is read
  // as a run of 32-bit values; the chain itself is followed through
  // $this->fatChains until ENDOFCHAIN is reached.
  //
  // The walk also stops when the chain revisits a sector or points at a sector
  // that has no FAT entry. Without these guards a damaged file would keep the
  // loop running (an undefined FAT entry evaluates to null, which never equals
  // ENDOFCHAIN) until memory is exhausted.
  private function readMiniFATChains() {
      // Sector size in bytes.
      $size = 1 << $this->sectorShift;
      $this->miniFATChains = array();

      // First sector of the MiniFAT chain.
      $from = $this->fMiniFAT;
      // Sectors already processed, used to detect cyclic chains.
      $visited = array();

      while ($from != self::ENDOFCHAIN) {
          if (isset($visited[$from])) {
              // Cyclic chain: everything reachable has already been read.
              break;
          }
          $visited[$from] = true;

          // Offset of the sector in the file (the header occupies the first slot).
          $start = ($from + 1) << $this->sectorShift;

          // One MiniFAT entry per four bytes.
          for ($i = 0; $i < $size; $i += 4) {
              $this->miniFATChains[] = $this->getLong($start + $i);
          }

          if (!isset($this->fatChains[$from])) {
              // Broken link: the FAT has no entry for this sector.
              break;
          }
          // Move on to the next sector of the chain.
          $from = $this->fatChains[$from];
      }
  }
  721. // The most important functions that reads structure of "files" of such a type
  722. // All the FS objects are written into this structure.
  // Reads the directory stream into $this->fatEntries.
  // The directory starts at sector $this->fDir and continues along the FAT
  // chain. Each entry occupies 128 bytes, so a sector holds (sector size / 128)
  // entries: 4 for 512-byte sectors, 32 for 4096-byte sectors.
  //
  // Every entry becomes an array with the keys
  //   name  - entry name (stored as 16-bit characters, converted by utf16_to_ansi)
  //   type  - entry type: stream, user data, empty, etc.
  //   color - colour of the node in the red-black tree
  //   left, right, child - indexes of the related entries
  //   start - offset of the content in the FAT or MiniFAT
  //   size  - size of the content
  //
  // Robustness: the chain walk stops on a cycle or a missing FAT entry, short
  // (truncated) entries are zero-padded so they read as empty entries, and the
  // final trimming loop stops once the list is empty. In the previous version
  // each of these situations could loop forever on a damaged file.
  private function readDirectoryStructure() {
      // First sector of the directory.
      $from = $this->fDir;
      // Sector size in bytes.
      $size = 1 << $this->sectorShift;
      $this->fatEntries = array();
      // Sectors already processed, used to detect cyclic chains.
      $visited = array();

      do {
          $visited[$from] = true;
          // Offset of the sector in the file (the header occupies the first slot).
          $start = ($from + 1) << $this->sectorShift;

          for ($i = 0; $i < $size; $i += 128) {
              // Raw entry, padded with NULs when the file ends early.
              $entry = str_pad((string) substr($this->data, $start + $i, 128), 128, "\0");

              // The name length (in bytes) is stored at 0x40, directly after the
              // name field, so a valid length can never exceed 0x40.
              $nameLength = min($this->getShort(0x40, $entry), 0x40);

              $this->fatEntries[] = array(
                  "name" => $this->utf16_to_ansi(substr($entry, 0, $nameLength)),
                  "type" => ord($entry[0x42]),
                  "color" => ord($entry[0x43]),
                  "left" => $this->getLong(0x44, $entry),
                  "right" => $this->getLong(0x48, $entry),
                  "child" => $this->getLong(0x4C, $entry),
                  "start" => $this->getLong(0x74, $entry),
                  "size" => $this->getSomeBytes($entry, 0x78, 8),
              );
          }

          if (!isset($this->fatChains[$from])) {
              // Broken link: the FAT has no entry for this sector.
              break;
          }
          // Next sector of the directory chain.
          $from = $this->fatChains[$from];
      } while ($from != self::ENDOFCHAIN && !isset($visited[$from]));

      // Drop the "empty" entries (type 0) at the end of the list, if any.
      while (!empty($this->fatEntries)
              && $this->fatEntries[count($this->fatEntries) - 1]["type"] == 0) {
          array_pop($this->fatEntries);
      }
  }
  765. // Support function to get the adequate name of the current entrie in FS.
  766. // Note: names are written in the Unicode.
  // Converts a string of 16-bit characters into a single-byte string.
  // Each two-byte unit is read with getShort() and turned into one character
  // with chr(); surrounding whitespace and NUL bytes are trimmed from the result.
  private function utf16_to_ansi($in) {
      $byteCount = strlen($in);
      $chars = array();

      for ($pos = 0; $pos < $byteCount; $pos += 2) {
          $chars[] = chr($this->getShort($pos, $in));
      }

      return trim(implode("", $chars));
  }
  // Converts a piece of text taken from the container into printable text.
  //
  // $in    - raw bytes of the text.
  // $check - when true the input is cleaned first:
  //          * if the first NUL byte is not at index 1 the input is treated as
  //            single-byte text: field codes (0x13 up to the next 0x15) and
  //            control characters are removed and the result is returned at once;
  //          * otherwise the input is treated as two-byte text: field codes and
  //            pairs of NUL bytes are removed and the text goes on to the
  //            conversion loop below.
  //
  // The conversion loop reads two bytes at a time. Characters whose upper byte
  // is 0 are copied as a single byte when they are printable (>= 32) or a
  // newline; other control characters are dropped. Everything between a 0x13
  // marker and the next 0x15 marker is skipped. Characters with a non-zero
  // upper byte are written as numeric HTML entities (&#xNNNN;), so despite its
  // name this method does not produce UTF-8 for them.
  //
  // Changes compared with the previous version:
  //  - the stray debugging output (echo "@") is gone;
  //  - the newline test used '\n' in single quotes, a two-character literal
  //    that can never equal one character, so newlines were always dropped;
  //    they are now kept as the conditions evidently intended;
  //  - control characters are filtered into a new string instead of being
  //    removed in place, which used to skip the character after each removal;
  //  - the unreachable switch after "continue" was removed;
  //  - an odd trailing byte no longer triggers an out-of-range string offset.
  protected function unicode_to_utf8($in, $check = false) {
      $out = "";

      if ($check && strpos($in, chr(0)) !== 1) {
          // Single-byte text: cut every field from its 0x13 marker up to
          // (not including) the matching 0x15 marker.
          while (($i = strpos($in, chr(0x13))) !== false) {
              $j = strpos($in, chr(0x15), $i + 1);
              if ($j === false)
                  break;
              $in = substr_replace($in, "", $i, $j - $i);
          }

          // Keep printable characters and newlines; drop every other control
          // character (this includes NUL bytes and the leftover 0x15 markers).
          $length = strlen($in);
          for ($i = 0; $i < $length; $i++) {
              if (ord($in[$i]) >= 32 || $in[$i] == "\n")
                  $out .= $in[$i];
          }
          return $out;
      } elseif ($check) {
          // Two-byte text: same field removal, with two-byte markers.
          // NOTE(review): both searches work on raw bytes and do not check that
          // a match starts on a character boundary - confirm this is acceptable.
          while (($i = strpos($in, chr(0x13).chr(0))) !== false) {
              $j = strpos($in, chr(0x15).chr(0), $i + 1);
              if ($j === false)
                  break;
              $in = substr_replace($in, "", $i, $j - $i);
          }
          $in = str_replace(chr(0).chr(0), "", $in);
      }

      // Loop through the two-byte words.
      $skip = false;
      $length = strlen($in);
      for ($i = 0; $i < $length; $i += 2) {
          $cd = substr($in, $i, 2);
          $low = ord($cd[0]);
          // A missing second byte (odd-length input) counts as 0.
          $high = (strlen($cd) > 1) ? ord($cd[1]) : 0;

          if ($skip) {
              // Inside a field: wait for the closing 0x15 marker.
              if ($high == 0x15 || $low == 0x15)
                  $skip = false;
              continue;
          }

          if ($high == 0) {
              // Upper byte is 0: single-byte character.
              if ($low >= 32 || $low == 0x0A)
                  $out .= $cd[0];
              elseif ($low == 0x13)
                  $skip = true;
              // Any other control character is dropped.
          } elseif ($high == 0x13) {
              // Treated as the start of a field as well.
              $skip = true;
          } else {
              // Otherwise convert to a numeric HTML entity.
              $out .= "&#x".sprintf("%04x", $this->getShort(0, $cd)).";";
          }
      }

      // and return the result
      return $out;
  }
  842. // Support function to geto some bytes from the string
  843. // taking into account order of bytes and converting values into a number.
  // Reads $count bytes starting at offset $from and returns them as a number.
  // $data is the string to read from; when it is null the file contents in
  // $this->data are used instead. If $this->isLittleEndian is set, the bytes
  // are reversed before conversion so that the first byte becomes the least
  // significant one.
  protected function getSomeBytes($data, $from, $count) {
      // Fall back to the whole file when no buffer was given.
      $buffer = ($data === null) ? $this->data : $data;

      $chunk = substr($buffer, $from, $count);
      $ordered = $this->isLittleEndian ? strrev($chunk) : $chunk;

      // binary -> hexadecimal text -> number
      return hexdec(bin2hex($ordered));
  }
  856. // Read a word from the variable (by default from this->data)
  // Reads a two-byte word at offset $from.
  // $data is the string to read from; null means $this->data.
  protected function getShort($from, $data = null) {
      $wordBytes = 2;
      return $this->getSomeBytes($data, $from, $wordBytes);
  }
  860. // read a double word from the variable (by default from this->data)
  // Reads a four-byte double word at offset $from.
  // $data is the string to read from; null means $this->data.
  protected function getLong($from, $data = null) {
      $dwordBytes = 4;
      return $this->getSomeBytes($data, $from, $dwordBytes);
  }
  864. }
  865. // Reading text from DOC
  866. // Версия 0.4
  867. // Author: Алексей Рембиш a.k.a Ramon
  868. // E-mail:
  869. // Copyright 2009
  870. // Comments translated by Sergey Butakov
  871. // Class to work with Microsoft Word Document (or just doc). It extends
  872. // Windows Compound Binary File Format. Lets try to find text here
  class doc extends cfb {
      // Extends the parent parse() and returns the text of the document.
      // Returns false when a required stream is missing or when the piece table
      // cannot be located or is inconsistent.
      public function parse() {
          parent::parse();

          // To read a DOC file we need two streams: WordDocument and either
          // 0Table or 1Table. The first one contains the pieces of text.
          $wdStreamID = $this->getStreamIdByName("WordDocument");
          if ($wdStreamID === false) { return false; }
          $wdStream = $this->getStreamById($wdStreamID);

          // The File Information Block (FIB) sits at the beginning of the
          // WordDocument stream. One bit of the word at 0x000A tells which
          // table stream to use: number 0 or number 1.
          $bytes = $this->getShort(0x000A, $wdStream);
          $fWhichTblStm = ($bytes & 0x0200) == 0x0200;

          // Position and size of the CLX inside the table stream.
          $fcClx = $this->getLong(0x01A2, $wdStream);
          $lcbClx = $this->getLong(0x01A6, $wdStream);

          // Character counts of the individual parts of the document.
          $ccpText = $this->getLong(0x004C, $wdStream);
          $ccpFtn = $this->getLong(0x0050, $wdStream);
          $ccpHdd = $this->getLong(0x0054, $wdStream);
          $ccpMcr = $this->getLong(0x0058, $wdStream);
          $ccpAtn = $this->getLong(0x005C, $wdStream);
          $ccpEdn = $this->getLong(0x0060, $wdStream);
          $ccpTxbx = $this->getLong(0x0064, $wdStream);
          $ccpHdrTxbx = $this->getLong(0x0068, $wdStream);

          // The last character position (CP): the sum of all counts, plus one
          // when anything besides the main text is present.
          $lastCP = $ccpFtn + $ccpHdd + $ccpMcr + $ccpAtn + $ccpEdn + $ccpTxbx + $ccpHdrTxbx;
          $lastCP += ($lastCP != 0 ? 1 : 0) + $ccpText;

          // Get the required table stream.
          $tStreamID = $this->getStreamIdByName(intval($fWhichTblStm)."Table");
          if ($tStreamID === false) { return false; }
          $tStream = $this->getStreamById($tStreamID);

          // Cut the CLX out of the table stream.
          $clx = (string) substr($tStream, $fcClx, $lcbClx);

          // What precedes the piece table inside the CLX is not interpreted
          // here, so the table is found by search: it starts with 0x02 followed
          // by its own size in four bytes. A candidate is accepted when that
          // size equals the number of bytes remaining after it.
          $pieceTable = null;
          $from = 0;
          while (($i = strpos($clx, chr(0x02), $from)) !== false) {
              $candidate = (string) substr($clx, $i + 5);
              if (strlen($candidate) == $this->getLong($i + 1, $clx)) {
                  $pieceTable = $candidate;
                  break;
              }
              // Not the piece table; keep looking after this byte.
              $from = $i + 1;
          }
          if ($pieceTable === null) {
              // No piece table: there is nothing reliable to extract.
              return false;
          }

          // Collect the character positions up to and including the last CP.
          // The loop is bounded by the table length: an unbounded read past the
          // end yields zeros forever when the last CP is absent.
          $cp = array();
          $i = 0;
          $tableLength = strlen($pieceTable);
          $reachedLastCP = false;
          while ($i + 4 <= $tableLength) {
              $position = $this->getLong($i, $pieceTable);
              $cp[] = $position;
              if ($position == $lastCP) {
                  $reachedLastCP = true;
                  break;
              }
              $i += 4;
          }
          if (!$reachedLastCP) {
              return false;
          }

          // The rest of the table holds the piece descriptors (PCD), 8 bytes each.
          $pcd = str_split((string) substr($pieceTable, $i + 4), 8);

          $text = "";
          // Read the text piece by piece.
          foreach ($pcd as $index => $descriptor) {
              // Stop at an incomplete descriptor or when no CP pair is left.
              if (strlen($descriptor) < 8 || !isset($cp[$index + 1])) {
                  break;
              }

              // Double word holding the offset and the compression flag.
              $fcValue = $this->getLong(2, $descriptor);
              // Flag set: one byte per character; flag clear: two bytes.
              $isANSI = ($fcValue & 0x40000000) == 0x40000000;
              // The remaining bits are the offset.
              $fc = $fcValue & 0x3FFFFFFF;

              // Length of the piece in characters.
              $lcb = $cp[$index + 1] - $cp[$index];
              if ($lcb <= 0) {
                  // Positions are not increasing; nothing sensible to read.
                  continue;
              }

              if ($isANSI) {
                  // One-byte text starts at half the stored offset. Integer
                  // division keeps the offset an int for substr().
                  $fc = (int) ($fc / 2);
              } else {
                  // Two-byte text needs twice as many bytes.
                  $lcb *= 2;
              }

              // Read the piece from the WordDocument stream.
              $part = (string) substr($wdStream, $fc, $lcb);
              // Two-byte text has to be decoded into regular text.
              if (!$isANSI)
                  $part = $this->unicode_to_utf8($part);

              $text .= $part;
          }

          // Remove entries with embedded objects from the text.
          $text = preg_replace("/HYPER13 *(INCLUDEPICTURE|HTMLCONTROL)(.*)HYPER15/iU", "", $text);
          $text = preg_replace("/HYPER13(.*)HYPER14(.*)HYPER15/iU", "$2", $text);

          return $text;
      }

      // Converts two-byte text to UTF-8.
      // Characters with a zero upper byte are copied when printable (>= 32);
      // the control characters 0x0D and 0x07 become a newline, 0x08 and 0x01 are
      // dropped, and the field markers 0x13/0x14/0x15 become the placeholders
      // HYPER13/HYPER14/HYPER15 that parse() removes afterwards. All other
      // characters are decoded through a numeric HTML entity.
      //
      // $check is accepted only to keep the signature compatible with the
      // parent class (PHP 8 raises a fatal error otherwise); it is not used.
      protected function unicode_to_utf8($in, $check = false) {
          $out = "";
          $length = strlen($in);

          // Loop through the two-byte sequences.
          for ($i = 0; $i < $length; $i += 2) {
              $cd = substr($in, $i, 2);
              $low = ord($cd[0]);
              // A missing second byte (odd-length input) counts as 0.
              $high = (strlen($cd) > 1) ? ord($cd[1]) : 0;

              if ($high != 0) {
                  // Decode via a numeric entity; the charset is given
                  // explicitly so the result does not depend on the
                  // default_charset setting.
                  $entity = "&#x".sprintf("%04x", $this->getShort(0, $cd)).";";
                  $out .= html_entity_decode($entity, ENT_QUOTES, "UTF-8");
                  continue;
              }

              if ($low >= 32) {
                  // Printable single-byte character: copy as is.
                  $out .= $cd[0];
                  continue;
              }

              // Control characters: translate the embedded commands we know.
              switch ($low) {
                  case 0x0D: case 0x07: $out .= "\n"; break;
                  case 0x08: case 0x01: $out .= ""; break;
                  case 0x13: $out .= "HYPER13"; break;
                  case 0x14: $out .= "HYPER14"; break;
                  case 0x15: $out .= "HYPER15"; break;
              }
          }

          return $out;
      }
  }
  1001. // Function to convert doc to plain-text. For those who "don't need classes".
  // Procedural shortcut around the doc class: reads the file named by
  // $filename and returns whatever doc::parse() returns for it.
  function doc2text($filename) {
      $reader = new doc();
      $reader->read($filename);

      $result = $reader->parse();
      return $result;
  }
  1007. // Reading text from PPT
  1008. // Version 0.3
  1009. // Auhtor: Алексей Рембиш a.k.a Ramon
  1010. // E-mail: alex@rembish.ru
  1011. // Copyright 2009
  1012. // Comments translated by Sergey
  1013. class ppt extends cfb {
  1014. public function parse() {
  1015. parent::parse();
  1016. // File must have Current User stream.
  1017. $cuStreamID = $this->getStreamIdByName("Current User");
  1018. if ($cuStreamID === false) { return false; }
  1019. // Get this stream and check hash (do we really have PowerPoint-presentation?)
  1020. // and read the offset to the first ocurence of UserEditAtom
  1021. $cuStream = $this->getStreamById($cuStreamID);
  1022. if ($this->getLong(12, $cuStream) == 0xF3D1C4DF) { return false; }
  1023. $offsetToCurrentEdit = $this->getLong(16, $cuStream);
  1024. // Getting stream named PowerPoint Document.
  1025. $ppdStreamID = $this->getStreamIdByName("PowerPoint Document");
  1026. if ($ppdStreamID === false) { return false; }
  1027. $ppdStream = $this->getStreamById($ppdStreamID);
  1028. // Look for all UserEditAtoms in PPT document. We need UserEditAtoms to get offsets to PersistDirectory.
  1029. $offsetLastEdit = $offsetToCurrentEdit;
  1030. $persistDirEntry = array();
  1031. $live = null;
  1032. $offsetPersistDirectory = array();
  1033. do {
  1034. $userEditAtom = $this->getRecord($ppdStream, $offsetLastEdit, 0x0FF5);
  1035. $live = &$userEditAtom;
  1036. array_unshift($offsetPersistDi

Large files files are truncated, but you can click here to view the full file