/textlib.php
PHP | 1324 lines | 859 code | 124 blank | 341 comment | 170 complexity | ec5d652f86a0fe68c7d4a489daf3b090 MD5 | raw file
- Large files are truncated, but you can click here to view the full file
- <?php
- /*
- This library is to process external files of different types
- Adopted mostly from DonRamon
- http://habrahabr.ru/blogs/php/70119/
- http://habrahabr.ru/blogs/php/69417/
- */
- // function getTextFromZippedXML
- // allows to work with .docx and .odt files
- // thanks to DonRamon http://habrahabr.ru/blogs/php/69417/
// Extracts the plain text of one XML member of a zip container (.docx, .odt).
// Thanks to DonRamon http://habrahabr.ru/blogs/php/69417/
//
// $archiveFile - path to the zip archive
// $contentFile - name of the member to read (e.g. "word/document.xml")
// Returns the member's content with all markup stripped, or the literal string
// "ERROR in text Tokenization" when the archive or the member cannot be read
// or the member is not parseable XML.
function getTextFromZippedXML($archiveFile, $contentFile) {
    $error = "ERROR in text Tokenization";

    $zip = new ZipArchive;
    // open() returns TRUE on success and an integer error code on failure;
    // the error codes are truthy, so the result must be compared strictly.
    if ($zip->open($archiveFile) !== true) {
        return $error;
    }

    // Read the member (if present) and release the archive right away.
    $index = $zip->locateName($contentFile);
    $content = ($index !== false) ? $zip->getFromIndex($index) : false;
    $zip->close();
    if ($content === false || $content === "") {
        return $error;
    }

    // Put every "<w:p " paragraph start on its own line so paragraphs stay separated.
    $content = str_replace("<w:p ", "\n<w:p ", $content);

    // loadXML() is an instance method (calling it statically fails on PHP 8).
    // LIBXML_NOENT is intentionally NOT passed: substituting entities in an
    // uploaded document would allow external-entity (XXE) file disclosure.
    // LIBXML_NONET additionally forbids network access while parsing.
    $xml = new DOMDocument();
    if (!$xml->loadXML($content, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
        return $error;
    }

    // Return the data without any tags.
    return strip_tags($xml->saveXML());
}
- //
- // functions rtf_isPlainText and rtf2text
- // support RTF files
- //
- // thanks to DonRamon http://habrahabr.ru/blogs/php/70119/
// Tells whether a group's collected control words allow its text to be output.
// $s maps control-word names to their values; the text is rejected as soon as
// one of the markers "*", "fonttbl", "colortbl", "datastore" or "themedata"
// holds a non-empty value. Returns true otherwise.
function rtf_isPlainText($s) {
    foreach (array("*", "fonttbl", "colortbl", "datastore", "themedata") as $marker) {
        if (!empty($s[$marker])) {
            return false;
        }
    }
    return true;
}
// Extracts the visible text of an RTF file.
//
// $filename - path of the RTF file
// Returns the extracted text ("" when the file is empty or unreadable).
//
// The parser keeps one array of seen control words per open group on $stack
// ($j is the index of the current group) and asks rtf_isPlainText() whether
// characters of the current group belong to the document text.
function rtf2text($filename) {
    $text = file_get_contents($filename);
    if ($text === false || $text === "")
        return "";

    // Start with an empty stack of group states.
    $document = "";
    $stack = array();
    $j = -1;
    $len = strlen($text);

    for ($i = 0; $i < $len; $i++) {
        $c = $text[$i];
        switch ($c) {
            // Backslash introduces a control symbol or a control word.
            case "\\":
                // A backslash that ends the file has nothing to escape.
                $nc = ($i + 1 < $len) ? $text[$i + 1] : "";
                $group = isset($stack[$j]) ? $stack[$j] : null;

                if ($nc == '\\' && rtf_isPlainText($group)) $document .= '\\';
                elseif ($nc == '~' && rtf_isPlainText($group)) $document .= ' ';
                elseif ($nc == '_' && rtf_isPlainText($group)) $document .= '-';
                // Escaped braces are literal characters, not group delimiters.
                elseif (($nc == '{' || $nc == '}') && rtf_isPlainText($group)) $document .= $nc;
                // \* marks the current group as a destination to be ignored.
                elseif ($nc == '*') $stack[$j]["*"] = true;
                // \'hh - a character given by two hex digits.
                elseif ($nc == "'") {
                    $hex = (string) substr($text, $i + 2, 2);
                    if (preg_match("#^[0-9a-f]+$#i", $hex) && rtf_isPlainText($group))
                        $document .= html_entity_decode("&#".hexdec($hex).";", ENT_QUOTES, "UTF-8");
                    // Skip the two hex digits.
                    $i += 2;
                // A control word: letters followed by an optional signed number.
                } elseif ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
                    $word = "";
                    // null means "no parameter seen yet"; empty() cannot be used
                    // here because it would treat the valid parameter "0" as missing.
                    $param = null;
                    for ($k = $i + 1, $m = 0; $k < $len; $k++, $m++) {
                        $nc = $text[$k];
                        if ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
                            if ($param === null)
                                $word .= $nc;
                            else
                                break;
                        } elseif ($nc >= '0' && $nc <= '9')
                            $param .= $nc;
                        elseif ($nc == '-') {
                            // A minus sign is only valid as the first parameter char.
                            if ($param === null)
                                $param .= $nc;
                            else
                                break;
                        } else
                            break;
                    }
                    // Move the index to just before the last consumed character;
                    // the shared $i++ below steps onto it.
                    $i += $m - 1;

                    $toText = "";
                    switch (strtolower($word)) {
                        case "u":
                            // \uN carries a signed 16-bit value; negative numbers
                            // stand for code points above 32767.
                            $code = (int) $param;
                            if ($code < 0)
                                $code += 65536;
                            $toText .= html_entity_decode("&#x".dechex($code).";", ENT_QUOTES, "UTF-8");
                            // Skip the fallback characters announced by \ucN.
                            $ucDelta = isset($stack[$j]["uc"]) ? $stack[$j]["uc"] : 0;
                            if ($ucDelta > 0)
                                $i += $ucDelta;
                            break;
                        case "par": case "page": case "column": case "line": case "lbr":
                            $toText .= "\n";
                            break;
                        case "emspace": case "enspace": case "qmspace":
                            $toText .= " ";
                            break;
                        case "tab": $toText .= "\t"; break;
                        case "chdate": $toText .= date("m.d.Y"); break;
                        case "chdpl": $toText .= date("l, j F Y"); break;
                        case "chdpa": $toText .= date("D, j M Y"); break;
                        case "chtime": $toText .= date("H:i:s"); break;
                        case "emdash": $toText .= html_entity_decode("—"); break;
                        case "endash": $toText .= html_entity_decode("–"); break;
                        case "bullet": $toText .= html_entity_decode("•"); break;
                        case "lquote": $toText .= html_entity_decode("‘"); break;
                        case "rquote": $toText .= html_entity_decode("’"); break;
                        case "ldblquote": $toText .= html_entity_decode("«"); break;
                        case "rdblquote": $toText .= html_entity_decode("»"); break;
                        default:
                            // Remember every other control word in the current group.
                            $stack[$j][strtolower($word)] = ($param === null) ? true : $param;
                            break;
                    }
                    // Re-read the group state: the default branch may have changed it.
                    if (rtf_isPlainText(isset($stack[$j]) ? $stack[$j] : null))
                        $document .= $toText;
                }
                $i++;
                break;
            // { opens a group that inherits the state of its parent.
            case "{":
                $stack[] = isset($stack[$j]) ? $stack[$j] : array();
                $j++;
                break;
            // } closes the current group and drops its state.
            case "}":
                array_pop($stack);
                $j--;
                break;
            // Raw NUL, CR, FF and LF bytes are not part of the text. The escapes
            // must be double-quoted: in single quotes they are two-character
            // strings that never equal a single byte.
            case "\0": case "\r": case "\f": case "\n":
                break;
            // Everything else is text, if the current group is visible.
            default:
                if (rtf_isPlainText(isset($stack[$j]) ? $stack[$j] : null))
                    $document .= $c;
                break;
        }
    }
    return $document;
}// end rtf2text
- // Reading text from PDF
- // Version 0.3
- // Author: Алексей Рембиш a.k.a Ramon
- // E-mail: alex@rembish.ru
- // Copyright 2009
- // Partial translation by Sergey
// Decodes a PDF ASCIIHexDecode stream.
//
// $input - hex digits, optionally separated by whitespace, terminated by '>'.
//          Text from '%' up to the end of the line is skipped as a comment.
// Returns the decoded bytes, or "" when the data contains a character that is
// not a hex digit or when the '>' terminator is missing.
function decodeAsciiHex($input) {
    $output = "";
    // High nibble waiting for its partner, or null when no digit is pending.
    $highNibble = null;
    $isComment = false;
    $length = strlen($input);

    for ($i = 0; $i < $length && $input[$i] != '>'; $i++) {
        $c = $input[$i];
        if ($isComment) {
            // Escapes must be double-quoted to denote the actual control bytes.
            if ($c == "\r" || $c == "\n")
                $isComment = false;
            continue;
        }
        if ($c == '%') {
            $isComment = true;
            continue;
        }
        // Whitespace between digits is ignored.
        if (strpos("\0\t\r\f\n ", $c) !== false)
            continue;
        // Anything that is not a hex digit makes the stream invalid.
        if (strpos("0123456789abcdefABCDEF", $c) === false)
            return "";

        $code = hexdec($c);
        if ($highNibble === null) {
            $highNibble = $code;
        } else {
            $output .= chr($highNibble * 16 + $code);
            $highNibble = null;
        }
    }

    // The loop ran off the end of the data: there was no '>' terminator.
    if ($i >= $length)
        return "";
    // An odd number of digits: the missing last digit counts as 0.
    if ($highNibble !== null)
        $output .= chr($highNibble * 16);
    return $output;
}
// Decodes a PDF ASCII85Decode stream.
//
// $input - base-85 text ('!'..'u', plus 'z' for four zero bytes), optionally
//          separated by whitespace and terminated by '~' (of the "~>" marker)
//          or by the end of the data.
// Returns the decoded bytes, or "" when an invalid character is met or the
// final group consists of a single character.
function decodeAscii85($input) {
    $output = "";
    $ords = array();
    $length = strlen($input);

    for ($i = 0, $state = 0; $i < $length && $input[$i] != '~'; $i++) {
        $c = $input[$i];
        // Whitespace is ignored (double quotes: these are the real control bytes).
        if (strpos("\0\t\r\f\n ", $c) !== false)
            continue;
        // NOTE: '%' is NOT a comment marker here - it is the regular base-85
        // digit 4 and has to be decoded like any other digit.
        if ($c == 'z' && $state === 0) {
            $output .= str_repeat(chr(0), 4);
            continue;
        }
        if ($c < '!' || $c > 'u')
            return "";

        $ords[$state++] = ord($c) - ord('!');
        // Five digits form one 32-bit group, i.e. four output bytes.
        if ($state == 5) {
            $state = 0;
            for ($sum = 0, $j = 0; $j < 5; $j++)
                $sum = $sum * 85 + $ords[$j];
            for ($j = 3; $j >= 0; $j--)
                $output .= chr(($sum >> ($j * 8)) & 0xFF);
        }
    }

    // A trailing group of n digits (2..4) encodes n - 1 bytes; a single
    // leftover digit cannot be produced by a valid encoder.
    if ($state === 1)
        return "";
    if ($state > 1) {
        // Rounding the last digit up compensates for the truncated digits.
        for ($i = 0, $sum = 0; $i < $state; $i++)
            $sum += ($ords[$i] + ($i == $state - 1 ? 1 : 0)) * pow(85, 4 - $i);
        for ($i = 0; $i < $state - 1; $i++)
            $output .= chr(($sum >> ((3 - $i) * 8)) & 0xFF);
    }
    return $output;
}
// Inflates a FlateDecode (zlib) stream - the most common compression of PDF
// data streams. Decompression warnings are suppressed; the value returned by
// gzuncompress() is handed back unchanged (false when the data is not valid).
function decodeFlate($input) {
    $inflated = @gzuncompress($input);
    return $inflated;
}
// Reads the attributes of a PDF object, i.e. the text found between the first
// "<<" and the first following ">>".
//
// Every attribute starts with "/". "/Option N" is returned as "Option" => N
// and a bare "/Param" as "Param" => true.
// Returns the attribute map; an empty array when no "<< ... >>" is present.
function getObjectOptions($object) {
    $found = array();
    if (!preg_match("#<<(.*)>>#ismU", $object, $found)) {
        return $found;
    }

    // Split on "/" and drop the piece in front of the first attribute.
    $segments = explode("/", $found[1]);
    array_shift($segments);

    $attributes = array();
    foreach ($segments as $segment) {
        // Collapse runs of whitespace so that name and value are one space apart.
        $segment = preg_replace("#\s+#", " ", trim($segment));
        if (strpos($segment, " ") === false) {
            $attributes[$segment] = true;
            continue;
        }
        $pieces = explode(" ", $segment);
        $attributes[$pieces[0]] = $pieces[1];
    }
    return $attributes;
}
// Decodes the data stream of a PDF object according to the object's attributes.
//
// $stream  - raw stream bytes
// $options - attribute map as produced by getObjectOptions()
// Returns $stream unchanged when the object has no Filter attribute; otherwise
// the stream cut to its declared Length and passed through every supported
// decoder (ASCIIHexDecode, ASCII85Decode, FlateDecode) named in $options, in
// the order the names appear there. Encrypted streams (Crypt) are not supported.
function getDecodedStream($stream, $options) {
    // Without a Filter attribute the content is stored as is.
    if (empty($options["Filter"]))
        return $stream;

    // Cut the data to the declared length, but only when the length really is
    // a positive number: a bare "/Length" is stored as true and would otherwise
    // truncate the stream to a single byte.
    $length = strlen($stream);
    $declared = isset($options["Length"]) ? $options["Length"] : null;
    if (is_int($declared) || (is_string($declared) && $declared !== "" && ctype_digit($declared))) {
        if ((int) $declared > 0)
            $length = (int) $declared;
    }
    $_stream = substr($stream, 0, $length);

    foreach ($options as $key => $value) {
        // Compare as strings: with a loose == an integer key such as 0 equals
        // any non-numeric string on PHP versions before 8.
        $name = (string) $key;
        if ($name === "ASCIIHexDecode")
            $_stream = decodeAsciiHex($_stream);
        elseif ($name === "ASCII85Decode")
            $_stream = decodeAscii85($_stream);
        elseif ($name === "FlateDecode")
            $_stream = decodeFlate($_stream);
    }
    return $_stream;
}
// Collects the raw ("dirty") text operands of the given text containers.
//
// $texts          - list the operands are appended to (passed by reference)
// $textContainers - contents found between BT and ET
// For every container the "[...] TJ" form is tried first; only when it yields
// nothing is the "Td (...) Tj" form tried.
function getDirtyTexts(&$texts, $textContainers) {
    $patterns = array(
        "#\[(.*)\]\s*TJ#ismU",
        "#Td\s*(\(.*\))\s*Tj#ismU",
    );
    foreach ($textContainers as $container) {
        foreach ($patterns as $pattern) {
            if (preg_match_all($pattern, $container, $found)) {
                // Add the pieces of raw data to the general list and move on
                // to the next container.
                $texts = array_merge($texts, $found[1]);
                break;
            }
        }
    }
}
// Collects character-code substitutions from a CMap-like stream.
//
// $transformations - map "source hex code" => "target hex code(s)" that the
//                    entries found are added to (passed by reference)
// $stream          - decoded stream content
//
// Two kinds of sections are read:
// - "N beginbfchar ... endbfchar": one mapping per line,
//     <0123> <abcd>      -> 0123 becomes abcd
//     <0123> <abcd6789>  -> 0123 becomes several symbols (abcd and 6789)
// - "N beginbfrange ... endbfrange": one range per line, either
//     <0000> <0020> <0a00>  -> 0000 becomes 0a00, 0001 becomes 0a01, ...
//   or
//     <0000> <0002> [<abcd> <0123> <8900>]  -> one listed target per code.
// N limits the number of lines read from a section.
function getCharTransformations(&$transformations, $stream) {
    preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
    preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

    // Single-character mappings.
    foreach ($chars as $section) {
        $count = (int) $section[1];
        // Lines may end with CR, LF or CRLF; splitting on "\n" alone would
        // glue all lines of a CR-delimited section together.
        $current = preg_split("#[\r\n]+#", trim($section[2]));
        for ($k = 0; $k < $count && $k < count($current); $k++) {
            // Source codes shorter than 4 digits are padded with zeros on the
            // right, the same way getTextUsingTransformations() pads the codes
            // it looks up.
            if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
                $transformations[str_pad($map[1], 4, "0")] = $map[2];
        }
    }

    // Range mappings.
    foreach ($ranges as $section) {
        $count = (int) $section[1];
        $current = preg_split("#[\r\n]+#", trim($section[2]));
        for ($k = 0; $k < $count && $k < count($current); $k++) {
            $line = trim($current[$k]);
            // First form: the targets are consecutive, starting at the third code.
            if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $_from = hexdec($map[3]);
                for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
                    $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
            // Second form: the targets are listed explicitly between [ and ].
            } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $parts = preg_split("#\s+#", trim($map[3]));
                for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) {
                    // Strip the angle brackets first: hexdec() only tolerates
                    // non-hex characters with a deprecation notice.
                    $digits = preg_replace("#[^0-9a-f]#i", "", $parts[$n]);
                    $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($digits));
                }
            }
        }
    }
}
// Second phase of PDF text extraction: turns raw text operands into text.
//
// $texts           - list of raw operands, for example
//                      (I love)10(PHP)    - literal strings in (); the numbers
//                                           between them are ignored
//                      <01234567>         - hex string holding the codes 0123
//                                           and 4567
//                      (Hello, \123world) - \123 is a character in octal
// $transformations - map "hex code" => "replacement hex code(s)" as collected
//                    by getCharTransformations()
// Returns the text; every operand is terminated by "\n".
function getTextUsingTransformations($texts, $transformations) {
    // Single-letter escapes of PDF literal strings. Double quotes (and chr()
    // for backspace) are required to get the control characters themselves.
    $escapes = array("n" => "\n", "r" => "\r", "t" => "\t", "b" => chr(8), "f" => "\f");

    $document = "";
    foreach ($texts as $text) {
        $text = (string) $text;
        $length = strlen($text);
        $isHex = false;
        $hex = "";
        // Nesting depth of parentheses; above 0 we are inside a literal string.
        $depth = 0;
        $plain = "";

        for ($j = 0; $j < $length; $j++) {
            $c = $text[$j];

            // Inside a literal string only \ ( and ) are special; in particular
            // < and > are ordinary characters there.
            if ($depth > 0) {
                if ($c === "\\") {
                    // A backslash at the very end escapes nothing.
                    if ($j + 1 >= $length)
                        break;
                    $c2 = $text[$j + 1];
                    if ($c2 === "\\" || $c2 === "(" || $c2 === ")") {
                        $plain .= $c2;
                        $j++;
                    } elseif (isset($escapes[$c2])) {
                        $plain .= $escapes[$c2];
                        $j++;
                    } elseif (preg_match("#^[0-7]{1,3}#", substr($text, $j + 1, 3), $oct)) {
                        // Up to three consecutive octal digits; overflow above
                        // one byte is discarded.
                        $plain .= html_entity_decode("&#".(octdec($oct[0]) & 0xFF).";", ENT_QUOTES, "UTF-8");
                        $j += strlen($oct[0]);
                    } elseif ($c2 === "\r" || $c2 === "\n") {
                        // Backslash + end of line is a line continuation.
                        $j++;
                        if ($c2 === "\r" && $j + 1 < $length && $text[$j + 1] === "\n")
                            $j++;
                    }
                    // Any other backslash is ignored; the character after it
                    // is handled as usual on the next pass.
                } elseif ($c === "(") {
                    // Balanced parentheses are part of the text.
                    $depth++;
                    $plain .= $c;
                } elseif ($c === ")") {
                    $depth--;
                    if ($depth === 0) {
                        $document .= $plain;
                        $plain = "";
                    } else
                        $plain .= $c;
                } else
                    $plain .= $c;
                continue;
            }

            switch ($c) {
                // Hex data start.
                case "<":
                    $hex = "";
                    $isHex = true;
                    break;
                // Hex data are over: decode them.
                case ">":
                    // Whitespace is allowed between the digits.
                    $digits = $isHex ? preg_replace("#[^0-9a-f]#i", "", $hex) : "";
                    if ($digits !== "") {
                        foreach (str_split($digits, 4) as $chunk) {
                            // Codes shorter than 4 digits get zeros appended.
                            $chex = str_pad($chunk, 4, "0");
                            // Ranges are stored with upper-case keys while the
                            // file may use either case, so try all spellings.
                            foreach (array($chex, strtoupper($chex), strtolower($chex)) as $key) {
                                if (isset($transformations[$key])) {
                                    $chex = (string) $transformations[$key];
                                    break;
                                }
                            }
                            // A replacement may consist of several 4-digit units.
                            foreach (str_split($chex, 4) as $unit)
                                $document .= html_entity_decode("&#x".$unit.";", ENT_QUOTES, "UTF-8");
                        }
                    }
                    $isHex = false;
                    $hex = "";
                    break;
                // A literal string starts.
                case "(":
                    $plain = "";
                    $depth = 1;
                    break;
                // Outside of a literal a backslash just hides the next character.
                case "\\":
                    $j++;
                    break;
                default:
                    if ($isHex)
                        $hex .= $c;
                    break;
            }
        }

        // Keep the text of a literal whose closing parenthesis is missing.
        if ($depth > 0)
            $document .= $plain;
        // Separate text blocks by EOL.
        $document .= "\n";
    }
    return $document;
}
// Extracts the text of a PDF file.
//
// $filename - path of the PDF file
// Returns the extracted text, or "" when the file is missing or empty.
//
// First pass: every object with a data stream is decoded; streams that contain
// BT ... ET text containers contribute raw text operands, all other streams
// are searched for character-code substitutions. Second pass: the raw operands
// are converted to text with the help of the substitutions.
function pdf2text($filename) {
    // file_get_contents() is binary safe. Its second argument is the
    // use_include_path switch, so the deprecated FILE_BINARY constant does not
    // belong there.
    $infile = @file_get_contents($filename);
    if (empty($infile))
        return "";

    $transformations = array();
    $texts = array();

    // Get the list of all objects of the file. When the match fails (e.g. the
    // PCRE backtrack limit is hit) there is simply nothing to iterate over.
    preg_match_all("#obj(.*)endobj#ismU", $infile, $objects);
    $objects = isset($objects[1]) ? $objects[1] : array();

    foreach ($objects as $currentObject) {
        // Only objects with a data stream are of interest.
        if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
            continue;
        $stream = ltrim($stream[1]);

        // Objects that carry a Length1, Type or Subtype attribute are skipped
        // to save time.
        $options = getObjectOptions($currentObject);
        if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
            continue;

        // The stream may hold text: decode it.
        $data = getDecodedStream($stream, $options);
        if (!strlen($data))
            continue;

        // Text containers add raw text to the findings; a stream without any
        // is searched for symbol transformations used in the second pass.
        if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
            getDirtyTexts($texts, $textContainers[1]);
        else
            getCharTransformations($transformations, $data);
    }

    // Interpret the collected text blocks using the symbol transformations.
    return getTextUsingTransformations($texts, $transformations);
}
- // Reading WCBFF
- // Version 0.2
- // Author: Алексей Рембиш a.k.a Ramon
- // E-mail: alex@rembish.ru
- // Copyright 2009
- // So, my little friends, below you can see a class that works with WCBFF (Windows Compound Binary File Format).
- // Why do we need it? This format serves as the basis for such "delicious" formats as .doc, .xls and .ppt.
- // Lets see how it looks like
- class cfb {
- // We gonna read the content of the file we need to decode into this variable.
- protected $data = "";
- // Sizes of FAT sector (1 << 9 = 512), Mini FAT sector (1 << 6 = 64) and maximum size
- // of the stream that could be written into a miniFAT.
- protected $sectorShift = 9;
- protected $miniSectorShift = 6;
- protected $miniSectorCutoff = 4096;
- // FAT-sector sequence array and Array of "files" belonging to this file structure
- protected $fatChains = array();
- protected $fatEntries = array();
- // Array of sequences of Mini FAT-sectors and the whole Mini FAT of our file
- protected $miniFATChains = array();
- protected $miniFAT = "";
- // Version (3 or 4), and way to write numbers (little-endian)
- private $version = 3;
- private $isLittleEndian = true;
- // The number of "files" and the position fo the first "file" in FAT
- private $cDir = 0;
- private $fDir = 0;
- // The number of FAT sectors in the file
- private $cFAT = 0;
- // The number of miniFAT-sectors and position of sequences of miniFAT-сsectors in the file
- private $cMiniFAT = 0;
- private $fMiniFAT = 0;
- // DIFAT: number of such sectors and offset to sector 110 (first 109 sectors are located in the header)
- private $DIFAT = array();
- private $cDIFAT = 0;
- private $fDIFAT = 0;
- // Constants: end of sequence and empty sector (4 bytes each)
- const ENDOFCHAIN = 0xFFFFFFFE;
- const FREESECT = 0xFFFFFFFF;
- // Read the file into internal variable
- public function read($filename) {
- $this->data = file_get_contents($filename);
- }
- public function parse() {
- // First of all we need to check weither we really have CFB in front of us.?
- // To do it we read the first 8 bytes and compare them with 2 patterns: common and the old one
- $abSig = strtoupper(bin2hex(substr($this->data, 0, 8)));
- if ($abSig != "D0CF11E0A1B11AE1" && $abSig != "0E11FC0DD0CF11E0") { return false; }
- // Read the file header;
- $this->readHeader();
- // get the remaining DIFAT sectors if any;
- $this->readDIFAT();
- // read the sequence of FAT sectors
- $this->readFATChains();
- // read the sequence of MiniFAT-sectors
- $this->readMiniFATChains();
- // read the structure of "directories" within the file
- $this->readDirectoryStructure();
- // Finally we need to check the root entry in the file structure.
- // This stream is required ot be in a file at least because it has a link
- // to file's miniFAT that we gonna read into $this->miniFAT
-
- $reStreamID = $this->getStreamIdByName("Root Entry");
- if ($reStreamID === false) { return false; }
- $this->miniFAT = $this->getStreamById($reStreamID, true);
- // Remove the unnecessary link to DIFAT-sectors, we have "stolen" complete FAT sequences instead of them.
- unset($this->DIFAT);
- // After all this we should be able to work with any of the "upper" formats from Microsoft such as doc, xls или ppt.
- }
- // Function that looks for stream number in the directory structure by its name.
- // It returns false if nothing was found.
- public function getStreamIdByName($name) {
- for($i = 0; $i < count($this->fatEntries); $i++) {
- if ($this->fatEntries[$i]["name"] == $name)
- return $i;
- }
- return false;
- }
- // Function gets the stream number ($id) and a second parameter (second perameter is required for the root entry only).
- // It returns the binary content fo this stream.
- public function getStreamById($id, $isRoot = false) {
- $entry = $this->fatEntries[$id];
- // Get the size and offset position to the content of "current" file.
- $from = $entry["start"];
- $size = $entry["size"];
- // Now 2 options are possible: is size is less than 4096 byte, then we need ot read data
- // from MiniFAT. If more than 4096 read from the common FAT. RootEntry is an exclusion:
- // we need ot read contents from FAT as miniFAT is located there.
- $stream = "";
- // So, here is the 1st option: small size and not root.
- if ($size < $this->miniSectorCutoff && !$isRoot) {
- // Get the miniFAT sector size - 64 bytes
- $ssize = 1 << $this->miniSectorShift;
- do {
- // Get the offset in miniFAT
- $start = $from << $this->miniSectorShift;
- // Read miniFAT-sector
- $stream .= substr($this->miniFAT, $start, $ssize);
- // Get the next piece of miniFAT in the array of chains
- $from = $this->miniFATChains[$from];
- // While not end of chain (sequence).
- } while ($from != self::ENDOFCHAIN);
- } else {
- // Second option - large piece - read it from FAT.
- // Get the sector size - 512 (or 4096 for new versions)
- $ssize = 1 << $this->sectorShift;
-
- do {
- // Getting the offset in the file (taking into account that there is a header of 512 bytes in the begining)
- $start = ($from + 1) << $this->sectorShift;
- // Read a sector
- $stream .= substr($this->data, $start, $ssize);
- // Get the next sector inthe array of FAT chains
- $from = $this->fatChains[$from];
- // While not end of chain (sequence).
- } while ($from != self::ENDOFCHAIN);
- }
- // Return the stream content accrding to its size.
- return substr($stream, 0, $size);
- }
- // This function reads data from file header
- private function readHeader() {
- // We need to get the information about the data format in the file
- $uByteOrder = strtoupper(bin2hex(substr($this->data, 0x1C, 2)));
- // We need to check if it is little-endian record
- $this->isLittleEndian = $uByteOrder == "FEFF";
- // Version 3 or 4 (never actually met 4th, but its description appears in the manual)
- $this->version = $this->getShort(0x1A);
- // Offsets for FAT and miniFAT
- $this->sectorShift = $this->getShort(0x1E);
- $this->miniSectorShift = $this->getShort(0x20);
- $this->miniSectorCutoff = $this->getLong(0x38);
- // Number of entries in the directory and offset to the first description in the file
- if ($this->version == 4)
- $this->cDir = $this->getLong(0x28);
- $this->fDir = $this->getLong(0x30);
- // Number of FAT sectors in the file
- $this->cFAT = $this->getLong(0x2C);
- // Number and position of hte 1st miniFAT-sector of sequences.
- $this->cMiniFAT = $this->getLong(0x40);
- $this->fMiniFAT = $this->getLong(0x3C);
- // Where are the FAT sector chains and how many of them are there.
- $this->cDIFAT = $this->getLong(0x48);
- $this->fDIFAT = $this->getLong(0x44);
- }
- // So.... DIFAT. DIFAT shows in which sectors we can find descriptions of FAT sector chains
- // Without these chains we won't be able to get stream contents in fragmented files
- private function readDIFAT() {
- $this->DIFAT = array();
- // First 109 links to sequences are being stored in the header of our file
- for ($i = 0; $i < 109; $i++)
- $this->DIFAT[$i] = $this->getLong(0x4C + $i * 4);
- // we also check if there are other links to chains. in small (upto 8.5MB) there is no such
- // links but in larger files we have to read them.
- if ($this->fDIFAT != self::ENDOFCHAIN) {
- // Sector size and start position to read links.
- $size = 1 << $this->sectorShift;
- $from = $this->fDIFAT;
- $j = 0;
- do {
- // Get the position in the file considering header
- $start = ($from + 1) << $this->sectorShift;
- // Read the links to sequences' sectors
- for ($i = 0; $i < ($size - 4); $i += 4)
- $this->DIFAT[] = $this->getLong($start + $i);
- // Getting the next DIFAT-sector. Link to this sector is written
- // as the last "word" in the current DIFAT-sector
- $from = $this->getLong($start + $i);
- // Ef sector exists we need to move there
- } while ($from != self::ENDOFCHAIN && ++$j < $this->cDIFAT);
- }
- // Remove the unnecessary links.
- while($this->DIFAT[count($this->DIFAT) - 1] == self::FREESECT)
- array_pop($this->DIFAT);
- }
- // So, we done with reading DIFAT. Now chains of FAT sectors should be converted
- // Lets go further.
- private function readFATChains() {
- // Sector size
- $size = 1 << $this->sectorShift;
- $this->fatChains = array();
- // Going through DIFAT array.
- for ($i = 0; $i < count($this->DIFAT); $i++) {
- // Go to the sector that we were looking for (with the header)
- $from = ($this->DIFAT[$i] + 1) << $this->sectorShift;
- // Getting the FAT chain: array index is a current sector,
- // value from an array s index of the next element or
- // ENDOFCHAIN - if it is last element in the chain.
- for ($j = 0; $j < $size; $j += 4)
- $this->fatChains[] = $this->getLong($from + $j);
- }
- }
- // We done with reading of FAT sequences. Now heed to read MiniFAT-sequences exaactly the same way.
- private function readMiniFATChains() {
- // Sector size
- $size = 1 << $this->sectorShift;
- $this->miniFATChains = array();
- // Looking for the first sector with MiniFAT- sequences
- $from = $this->fMiniFAT;
- // If MiniFAT appears to be in file then
- while ($from != self::ENDOFCHAIN) {
- // Looking for the offset to the sector with MiniFat-sequence
- $start = ($from + 1) << $this->sectorShift;
- // Read the sequence from the current sector
- for ($i = 0; $i < $size; $i += 4)
- $this->miniFATChains[] = $this->getLong($start + $i);
- // If this is notthe last sector in the chain we need to move forward
- $from = $this->fatChains[$from];
- }
- }
- // The most important functions that reads structure of "files" of such a type
- // All the FS objects are written into this structure.
- private function readDirectoryStructure() {
- // get the 1st sector with "files" in file system
- $from = $this->fDir;
- // Get the sector size
- $size = 1 << $this->sectorShift;
- $this->fatEntries = array();
- do {
- // get sector in the file
- $start = ($from + 1) << $this->sectorShift;
- // Let go through the content of this sector. One sector contains up to 4 (or 128 for version 4)
- // entries to FS. Lets read them.
- for ($i = 0; $i < $size; $i += 128) {
- // Get the binary data
- $entry = substr($this->data, $start + $i, 128);
- // and prcess these data:
- $this->fatEntries[] = array(
- // get the entry name
- "name" => $this->utf16_to_ansi(substr($entry, 0, $this->getShort(0x40, $entry))),
- // and its type: either stream, or user data, or empty sector, etc.
- "type" => ord($entry[0x42]),
- // its color in the Red-Black tree
- "color" => ord($entry[0x43]),
- // its "left" siblings
- "left" => $this->getLong(0x44, $entry),
- // its "right" siblings
- "right" => $this->getLong(0x48, $entry),
- // its child
- "child" => $this->getLong(0x4C, $entry),
- // offset to the content in FAT or miniFAT
- "start" => $this->getLong(0x74, $entry),
- // size of the content
- "size" => $this->getSomeBytes($entry, 0x78, 8),
- );
- }
- // get the next sector with descriptions and jump there
- $from = $this->fatChains[$from];
- // Of course if such a sector exists
- } while ($from != self::ENDOFCHAIN);
- // remove "empty" entries at the end if any.
- while($this->fatEntries[count($this->fatEntries) - 1]["type"] == 0)
- array_pop($this->fatEntries);
- }
- // Support function to get the adequate name of the current entrie in FS.
- // Note: names are written in the Unicode.
- private function utf16_to_ansi($in) {
- $out = "";
- for ($i = 0; $i < strlen($in); $i += 2)
- $out .= chr($this->getShort($i, $in));
- return trim($out);
- }
/**
 * Extracts printable text from a raw text run.
 *
 * Despite its name this method does not emit UTF-8 for non-ASCII characters:
 * every 2-byte word whose upper byte is non-zero is written as an HTML
 * numeric entity ("&#xHHHH;").
 *
 * When $check is true the input is first inspected:
 *  - if the first NUL byte is not at offset 1 the data is treated as
 *    single-byte text: everything from a 0x13 byte up to (not including) the
 *    next 0x15 byte is cut out, then all control characters except the line
 *    feed are dropped and the result is returned directly;
 *  - otherwise the data is treated as 2-byte words: the same 0x13..0x15
 *    cut is applied using the 2-byte forms, and NUL-NUL pairs are removed.
 *
 * @param string $in    Raw bytes of the text run.
 * @param bool   $check Whether to sniff the encoding and strip 0x13..0x15 ranges first.
 * @return string The extracted text.
 */
protected function unicode_to_utf8($in, $check = false) {
    if ($check && strpos($in, chr(0)) !== 1) {
        // Single-byte text: remove everything between 0x13 and the following 0x15.
        while (($begin = strpos($in, chr(0x13))) !== false) {
            $end = strpos($in, chr(0x15), $begin + 1);
            if ($end === false)
                break;
            $in = substr_replace($in, "", $begin, $end - $begin);
        }
        // Keep printable bytes and real line feeds, drop every other control
        // byte (NUL included). The text is rebuilt instead of being edited in
        // place, so no character is skipped after a removal.
        $clean = "";
        $length = strlen($in);
        for ($i = 0; $i < $length; $i++) {
            if (ord($in[$i]) >= 32 || $in[$i] == "\n")
                $clean .= $in[$i];
        }
        return $clean;
    } elseif ($check) {
        // 2-byte text: same cut, using the 2-byte forms of 0x13 / 0x15.
        // NOTE(review): both searches below are byte-based and can match across
        // a word boundary (e.g. the high byte of one word plus the low byte of
        // the next) - confirm whether alignment needs to be enforced.
        while (($begin = strpos($in, chr(0x13).chr(0))) !== false) {
            $end = strpos($in, chr(0x15).chr(0), $begin + 1);
            if ($end === false)
                break;
            $in = substr_replace($in, "", $begin, $end - $begin);
        }
        $in = str_replace(chr(0).chr(0), "", $in);
    }

    // Walk the data as 2-byte words; byte 0 is treated as the low byte.
    $out = "";
    $skip = false;
    $length = strlen($in);
    for ($i = 0; $i < $length; $i += 2) {
        $cd = substr($in, $i, 2);
        $low = ord($cd[0]);
        // A trailing odd byte has no upper byte; treat it as zero instead of
        // reading past the end of the string.
        $high = isset($cd[1]) ? ord($cd[1]) : 0;

        if ($skip) {
            // Inside a 0x13..0x15 range: swallow words until 0x15 shows up.
            if ($high == 0x15 || $low == 0x15)
                $skip = false;
            continue;
        }

        if ($high == 0) {
            // Upper byte is zero: single-byte character.
            if ($low >= 32 || $cd[0] == "\n")
                $out .= $cd[0];
            elseif ($low == 0x13)
                $skip = true;
            // Any other control character is silently dropped.
        } elseif ($high == 0x13) {
            // An upper byte of 0x13 also starts a skipped range.
            $skip = true;
        } else {
            // Everything else becomes an HTML numeric entity.
            $out .= "&#x".sprintf("%04x", $this->getShort(0, $cd)).";";
        }
    }
    return $out;
}
- // Helper that reads some bytes from the string, taking the byte order
- // into account, and converts the value into a number.
/**
 * Reads $count bytes starting at $from and returns them as a number.
 *
 * @param string|null $data  Buffer to read from; null means $this->data.
 * @param int         $from  Byte offset of the first byte to read.
 * @param int         $count Number of bytes to read.
 * @return int|float Numeric value of the bytes (whatever hexdec() yields).
 */
protected function getSomeBytes($data, $from, $count) {
    // Fall back to the object's own buffer when no explicit one is given.
    $source = ($data === null) ? $this->data : $data;
    $chunk = substr($source, $from, $count);
    // Little-endian data is reversed so the hex string reads most-significant byte first.
    $ordered = $this->isLittleEndian ? strrev($chunk) : $chunk;
    $hex = bin2hex($ordered);
    return hexdec($hex);
}
- // Read a word from the variable (by default from this->data)
/**
 * Reads a 2-byte word at offset $from.
 *
 * @param int         $from Byte offset of the word.
 * @param string|null $data Buffer to read from; null is passed through to getSomeBytes().
 * @return int|float The value returned by getSomeBytes().
 */
protected function getShort($from, $data = null) {
    $wordSize = 2;
    return $this->getSomeBytes($data, $from, $wordSize);
}
- // read a double word from the variable (by default from this->data)
/**
 * Reads a 4-byte double word at offset $from.
 *
 * @param int         $from Byte offset of the double word.
 * @param string|null $data Buffer to read from; null is passed through to getSomeBytes().
 * @return int|float The value returned by getSomeBytes().
 */
protected function getLong($from, $data = null) {
    $dwordSize = 4;
    return $this->getSomeBytes($data, $from, $dwordSize);
}
- }
- // Reading text from DOC
- // Version 0.4
- // Author: Алексей Рембиш a.k.a Ramon
- // E-mail:
- // Copyright 2009
- // Comments translated by Sergey Butakov
- // Class to work with Microsoft Word documents (or just doc). It extends the
- // Windows Compound Binary File Format class. Let's try to find the text here.
/**
 * Text extractor for Microsoft Word binary documents (.doc), built on top of
 * the compound-file reader cfb.
 */
class doc extends cfb {
    /**
     * Extracts the document text.
     *
     * Reads the "WordDocument" stream and the "0Table"/"1Table" stream, locates
     * the piece table inside the CLX block and concatenates all text pieces.
     *
     * @return string|false The extracted text, or false when a required stream
     *                      or the piece table cannot be found.
     */
    public function parse() {
        parent::parse();

        // The WordDocument stream holds the text pieces and, at its start,
        // the File Information Block (FIB).
        $wdStreamID = $this->getStreamIdByName("WordDocument");
        if ($wdStreamID === false) { return false; }
        $wdStream = $this->getStreamById($wdStreamID);

        // Flag word from the FIB; bit 0x0200 selects table stream 1 over 0.
        $bytes = $this->getShort(0x000A, $wdStream);
        $fWhichTblStm = ($bytes & 0x0200) == 0x0200;

        // Position and size of the CLX block inside the table stream.
        $fcClx = $this->getLong(0x01A2, $wdStream);
        $lcbClx = $this->getLong(0x01A6, $wdStream);

        // Character counts of the individual document parts.
        $ccpText = $this->getLong(0x004C, $wdStream);
        $ccpFtn = $this->getLong(0x0050, $wdStream);
        $ccpHdd = $this->getLong(0x0054, $wdStream);
        $ccpMcr = $this->getLong(0x0058, $wdStream);
        $ccpAtn = $this->getLong(0x005C, $wdStream);
        $ccpEdn = $this->getLong(0x0060, $wdStream);
        $ccpTxbx = $this->getLong(0x0064, $wdStream);
        $ccpHdrTxbx = $this->getLong(0x0068, $wdStream);

        // Last character position (CP): sum of all parts, plus one extra
        // position when anything besides the main text is present.
        $lastCP = $ccpFtn + $ccpHdd + $ccpMcr + $ccpAtn + $ccpEdn + $ccpTxbx + $ccpHdrTxbx;
        $lastCP += ($lastCP != 0 ? 1 : 0) + $ccpText;

        // Fetch the table stream selected by the FIB flag.
        $tStreamID = $this->getStreamIdByName(intval($fWhichTblStm)."Table");
        if ($tStreamID === false) { return false; }
        $tStream = $this->getStreamById($tStreamID);

        // Cut the CLX block out of the table stream. The cast covers PHP
        // versions where substr() returns false for an out-of-range offset.
        $clx = (string) substr($tStream, $fcClx, $lcbClx);

        $pieceTable = $this->findPieceTable($clx);
        if ($pieceTable === false) { return false; }

        // Collect character positions until the last CP is reached. The scan
        // is bounded by the table length so that a malformed file cannot make
        // it run forever.
        $cp = array();
        $tableLength = strlen($pieceTable);
        $offset = 0;
        $reachedLastCP = false;
        while ($offset + 4 <= $tableLength) {
            $position = $this->getLong($offset, $pieceTable);
            $cp[] = $position;
            if ($position == $lastCP) {
                $reachedLastCP = true;
                break;
            }
            $offset += 4;
        }
        if (!$reachedLastCP) { return false; }

        // Everything after the CP array is a list of 8-byte piece descriptors (PCD).
        $pcd = str_split((string) substr($pieceTable, $offset + 4), 8);
        $pieceCount = count($cp) - 1;

        $text = "";
        for ($i = 0; $i < $pieceCount; $i++) {
            // Stop on a missing or truncated descriptor.
            if (!isset($pcd[$i]) || strlen($pcd[$i]) < 8)
                break;

            // Double word holding the stream offset and the compression flag.
            $fcValue = $this->getLong(2, $pcd[$i]);
            // Bit 0x40000000 set: single-byte (ANSI) text; clear: 2-byte text.
            $isANSI = ($fcValue & 0x40000000) == 0x40000000;
            // The remaining bits are the offset.
            $fc = $fcValue & 0x3FFFFFFF;

            // Piece length in characters.
            $lcb = $cp[$i + 1] - $cp[$i];
            if ($lcb <= 0)
                continue;

            if (!$isANSI) {
                // 2-byte text occupies twice as many bytes.
                $lcb *= 2;
            } else {
                // ANSI pieces store a doubled offset; halve it with an integer
                // shift so substr() never receives a fractional offset.
                $fc = $fc >> 1;
            }

            $part = (string) substr($wdStream, $fc, $lcb);
            if (!$isANSI)
                $part = $this->unicode_to_utf8($part);
            $text .= $part;
        }

        // Drop embedded-object fields entirely, and reduce other fields to
        // the part between the HYPER14 and HYPER15 markers.
        // NOTE(review): the markers are only produced for 2-byte pieces by
        // unicode_to_utf8(); ANSI pieces keep their raw control bytes.
        $text = preg_replace("/HYPER13 *(INCLUDEPICTURE|HTMLCONTROL)(.*)HYPER15/iU", "", $text);
        $text = preg_replace("/HYPER13(.*)HYPER14(.*)HYPER15/iU", "$2", $text);

        return $text;
    }

    /**
     * Searches the CLX block for the piece table.
     *
     * Every 0x02 byte is tried as a candidate start: the following 4 bytes
     * are read as the table size, and the candidate is accepted when exactly
     * that many bytes remain after the size field.
     *
     * @param string $clx Raw CLX block.
     * @return string|false The piece table bytes, or false when no candidate fits.
     */
    private function findPieceTable($clx) {
        $from = 0;
        while (($marker = strpos($clx, chr(0x02), $from)) !== false) {
            $lcbPieceTable = $this->getLong($marker + 1, $clx);
            $candidate = (string) substr($clx, $marker + 5);
            if (strlen($candidate) == $lcbPieceTable)
                return $candidate;
            // Size mismatch: not the piece table, keep looking.
            $from = $marker + 1;
        }
        return false;
    }

    /**
     * Converts a run of 2-byte characters to text.
     *
     * Words with a zero upper byte are copied as single bytes (with a few
     * control codes translated to newlines or HYPERnn field markers); all
     * other words are decoded to UTF-8 through a numeric HTML entity.
     *
     * NOTE(review): words with a zero upper byte and a lower byte of 128-255
     * are copied as one raw byte, which is not valid UTF-8 - confirm whether
     * callers rely on that. Each word is converted on its own, so surrogate
     * pairs are not combined.
     *
     * @param string $in    Raw bytes, two per character, low byte first.
     * @param bool   $check Unused here; accepted so the signature stays
     *                      compatible with the two-argument version of this
     *                      method declared earlier in the file (presumably
     *                      the parent's), which PHP 8 requires.
     * @return string The converted text.
     */
    protected function unicode_to_utf8($in, $check = false) {
        $out = "";
        $length = strlen($in);
        for ($i = 0; $i < $length; $i += 2) {
            $cd = substr($in, $i, 2);
            $low = ord($cd[0]);
            // A trailing odd byte has no upper byte; treat it as zero.
            $high = isset($cd[1]) ? ord($cd[1]) : 0;

            if ($high == 0) {
                // Printable single-byte character: copy as is.
                if ($low >= 32)
                    $out .= $cd[0];
                // Control codes with a special meaning; 0x08 and 0x01 (and
                // every other control code) produce no output.
                switch ($low) {
                    case 0x0D: case 0x07: $out .= "\n"; break;
                    case 0x13: $out .= "HYPER13"; break;
                    case 0x14: $out .= "HYPER14"; break;
                    case 0x15: $out .= "HYPER15"; break;
                }
            } else {
                // Decode via a numeric entity; the target charset is given
                // explicitly so the result does not depend on PHP defaults.
                $entity = "&#x".sprintf("%04x", $this->getShort(0, $cd)).";";
                $out .= html_entity_decode($entity, ENT_QUOTES, "UTF-8");
            }
        }
        return $out;
    }
}
- // Function to convert doc to plain-text. For those who "don't need classes".
/**
 * Procedural shortcut around the doc class: loads a .doc file and returns
 * whatever doc::parse() produces for it.
 *
 * @param string $filename Path of the file handed to doc::read().
 * @return mixed The result of doc::parse().
 */
function doc2text($filename) {
    $reader = new doc();
    $reader->read($filename);
    $extracted = $reader->parse();
    return $extracted;
}
- // Reading text from PPT
- // Version 0.3
- // Author: Алексей Рембиш a.k.a Ramon
- // E-mail: alex@rembish.ru
- // Copyright 2009
- // Comments translated by Sergey
- class ppt extends cfb {
- public function parse() {
- parent::parse();
- // File must have Current User stream.
- $cuStreamID = $this->getStreamIdByName("Current User");
- if ($cuStreamID === false) { return false; }
- // Get this stream and check hash (do we really have PowerPoint-presentation?)
- // and read the offset to the first ocurence of UserEditAtom
- $cuStream = $this->getStreamById($cuStreamID);
- if ($this->getLong(12, $cuStream) == 0xF3D1C4DF) { return false; }
- $offsetToCurrentEdit = $this->getLong(16, $cuStream);
- // Getting stream named PowerPoint Document.
- $ppdStreamID = $this->getStreamIdByName("PowerPoint Document");
- if ($ppdStreamID === false) { return false; }
- $ppdStream = $this->getStreamById($ppdStreamID);
- // Look for all UserEditAtoms in PPT document. We need UserEditAtoms to get offsets to PersistDirectory.
- $offsetLastEdit = $offsetToCurrentEdit;
- $persistDirEntry = array();
- $live = null;
- $offsetPersistDirectory = array();
- do {
- $userEditAtom = $this->getRecord($ppdStream, $offsetLastEdit, 0x0FF5);
- $live = &$userEditAtom;
- array_unshift($offsetPersistDi…
Large files files are truncated, but you can click here to view the full file