/modules/AOD_Index/LuceneUtils.php
PHP | 320 lines | 221 code | 18 blank | 81 comment | 16 complexity | 36ef4375bb0756c50e81c0b4371baac6 MD5 | raw file
- <?php
- /**
- *
- *
- * @package
- * @copyright SalesAgility Ltd http://www.salesagility.com
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE
- * along with this program; if not, see http://www.gnu.org/licenses
- * or write to the Free Software Foundation,Inc., 51 Franklin Street,
- * Fifth Floor, Boston, MA 02110-1301 USA
- *
- * @author Salesagility Ltd <support@salesagility.com>
- */
/**
 * Makes the bundled Zend Lucene library loadable and loads its entry point.
 *
 * The library directory is appended to the include path only when it is not
 * already listed, so calling this function repeatedly does not keep growing
 * the include path with duplicate entries.
 *
 * @return void
 */
function requireLucene(){
    $libraryDir = "modules/AOD_Index/Lib";
    $currentPath = get_include_path();
    if (!in_array($libraryDir, explode(PATH_SEPARATOR, $currentPath), true)) {
        set_include_path($currentPath . PATH_SEPARATOR . $libraryDir);
    }
    require_once('Zend/Search/Lucene.php');
}
/**
 * Builds the relative path of a document revision inside the upload directory.
 *
 * @param mixed $revisionId Identifier of the revision; it is appended as-is.
 * @return string "upload/" followed by the given identifier.
 */
function getDocumentRevisionPath($revisionId){
    return 'upload/' . $revisionId;
}
/**
 * Creates a lucene document for the PPTX file at the given path.
 *
 * The document is produced by Zend_Search_Lucene_Document_Pptx::loadPptxFile()
 * and then gets a 'filename' text field holding the base name of the path.
 *
 * @param string $path Path to the PPTX file.
 * @return Zend_Search_Lucene_Document
 */
function createPPTXDocument($path){
    $luceneDoc = Zend_Search_Lucene_Document_Pptx::loadPptxFile($path);
    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $luceneDoc->addField($nameField);
    return $luceneDoc;
}
/**
 * Creates a lucene document for the XLSX file at the given path.
 *
 * The document is produced by Zend_Search_Lucene_Document_Xlsx::loadXlsxFile()
 * and then gets a 'filename' text field holding the base name of the path.
 *
 * @param string $path Path to the XLSX file.
 * @return Zend_Search_Lucene_Document
 */
function createXLSXDocument($path){
    $luceneDoc = Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($path);
    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $luceneDoc->addField($nameField);
    return $luceneDoc;
}
/**
 * Creates a lucene document for the HTML file at the given path.
 *
 * The document is produced by Zend_Search_Lucene_Document_Html::loadHTMLFile()
 * and then gets a 'filename' text field holding the base name of the path.
 *
 * @param string $path Path to the HTML file.
 * @return Zend_Search_Lucene_Document
 */
function createHTMLDocument($path){
    $luceneDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($path);
    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $luceneDoc->addField($nameField);
    return $luceneDoc;
}
/**
 * Creates a lucene document for the DocX file at the given path.
 *
 * The document is produced by Zend_Search_Lucene_Document_Docx::loadDocxFile()
 * and then gets a 'filename' text field holding the base name of the path.
 *
 * @param string $path Path to the DocX file.
 * @return Zend_Search_Lucene_Document
 */
function createDocXDocument($path){
    $luceneDoc = Zend_Search_Lucene_Document_Docx::loadDocxFile($path);
    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $luceneDoc->addField($nameField);
    return $luceneDoc;
}
/**
 * Given a path to a Doc document returns a lucene document with filename and contents set.
 *
 * The text is recovered heuristically: the raw bytes are split on carriage
 * returns, every chunk that is empty or contains a NUL byte is discarded, the
 * remaining chunks are joined with spaces and finally every character outside
 * a small whitelist is stripped.
 *
 * When the file is missing, unreadable or empty the returned document still
 * carries the filename but has empty contents, so a single bad upload cannot
 * abort an indexing run.
 *
 * @param string $path Path to the .doc file.
 * @return Zend_Search_Lucene_Document
 */
function createDocDocument($path){
    // file_get_contents() is binary safe and, unlike fread(), copes with
    // zero-length files (fread() rejects a length of 0).
    $raw = (is_file($path) && is_readable($path)) ? file_get_contents($path) : false;
    if ($raw === false) {
        $raw = "";
    }

    $outtext = "";
    foreach (explode(chr(0x0D), $raw) as $thisline) {
        // Chunks containing NUL bytes are binary structures, not text.
        if ($thisline === "" || strpos($thisline, chr(0x00)) !== false) {
            continue;
        }
        $outtext .= $thisline . " ";
    }

    $outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/", "", $outtext);
    if ($outtext === null) {
        // preg_replace() signals an internal PCRE failure with null.
        $outtext = "";
    }

    $doc = new Zend_Search_Lucene_Document();
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $outtext));
    return $doc;
}
/**
 * Creates a lucene document for the PDF file at the given path.
 *
 * The text is obtained from PdfParser::parseFile() and indexed as the unstored
 * 'contents' field; the base name of the path is kept as the 'filename' field.
 *
 * @param string $path Path to the PDF file.
 * @return Zend_Search_Lucene_Document
 */
function createPDFDocument($path){
    require_once('PdfParser.php');
    $pdfText = PdfParser::parseFile($path);

    $luceneDoc = new Zend_Search_Lucene_Document();
    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $luceneDoc->addField($nameField);
    $bodyField = Zend_Search_Lucene_Field::UnStored('contents', $pdfText);
    $luceneDoc->addField($bodyField);

    return $luceneDoc;
}
/**
 * Given a path to an ODT doc returns a lucene document with contents and filename set.
 *
 * The ODT package is opened as a zip archive, its content.xml is parsed and
 * the string value of every element in the "text" namespace is collected into
 * the 'contents' field.
 *
 * @param string $path Path to the ODT file.
 * @return bool|Zend_Search_Lucene_Document false when the path is not a file,
 *         the archive cannot be opened, content.xml is missing or it is not
 *         well-formed XML; the lucene document otherwise.
 */
function createOdtDocument($path){
    if(!is_file($path)){
        return false;
    }

    // ZipArchive::open() returns true on success and an error code otherwise.
    $package = new ZipArchive();
    if ($package->open($path) !== true) {
        return false;
    }
    $xml = $package->getFromName("content.xml");
    $package->close();
    if ($xml === false) {
        return false;
    }

    $contents = simplexml_load_string($xml);
    if ($contents === false) {
        return false;
    }

    $documentBody = array();
    $paragraphs = $contents->xpath('//text:*');
    // xpath() does not return an array when the query fails, e.g. because the
    // "text" prefix is not declared in the document.
    if (is_array($paragraphs)) {
        foreach ($paragraphs as $paragraph) {
            $documentBody[] = (string)$paragraph;
            $documentBody[] = ' ';
        }
    }

    $doc = new Zend_Search_Lucene_Document();
    $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', implode(' ', $documentBody), 'UTF-8'));
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    return $doc;
}
/**
 * Creates a lucene document for the plain text file at the given path.
 *
 * The base name of the path becomes the 'filename' field and the result of
 * file_get_contents() on the path becomes the unstored 'contents' field.
 *
 * @param string $path Path to the text file.
 * @return Zend_Search_Lucene_Document
 */
function createTextDocument($path){
    $luceneDoc = new Zend_Search_Lucene_Document();

    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $luceneDoc->addField($nameField);

    $bodyField = Zend_Search_Lucene_Field::UnStored('contents', file_get_contents($path));
    $luceneDoc->addField($bodyField);

    return $luceneDoc;
}
/**
 * Creates a lucene document for the RTF file at the given path.
 *
 * The base name of the path becomes the 'filename' field; the text returned by
 * rtf2text() for the path becomes the unstored 'contents' field.
 *
 * @param string $path Path to the RTF file.
 * @return Zend_Search_Lucene_Document
 */
function createRTFDocument($path){
    $luceneDoc = new Zend_Search_Lucene_Document();

    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $luceneDoc->addField($nameField);

    $plainText = rtf2text($path);
    $bodyField = Zend_Search_Lucene_Field::UnStored('contents', $plainText);
    $luceneDoc->addField($bodyField);

    return $luceneDoc;
}
/**
 * Tells whether the given RTF group state allows text to be emitted.
 *
 * @param mixed $s Group state as built by rtf2text(): control words as keys.
 * @return bool false as soon as one of the entries "*", "fonttbl", "colortbl",
 *         "datastore" or "themedata" holds a non-empty value, true otherwise.
 */
function rtf_isPlainText($s) {
    foreach (array("*", "fonttbl", "colortbl", "datastore", "themedata") as $blockingWord) {
        if (!empty($s[$blockingWord])) {
            return false;
        }
    }
    return true;
}
/**
 * Converts a Unicode code point into its UTF-8 text.
 *
 * @param int $code Code point.
 * @return string The UTF-8 character, or "" when html_entity_decode() refuses
 *         the code point (for example 0, surrogates or negative values).
 */
function rtf_codePointToUtf8($code) {
    $entity = "&#" . (int)$code . ";";
    $decoded = html_entity_decode($entity, ENT_QUOTES, 'UTF-8');
    // An entity that comes back unchanged was not decodable; emit nothing
    // rather than leaking the raw entity text into the output.
    return ($decoded === $entity) ? "" : $decoded;
}

/**
 * Extracts the plain text from an RTF file.
 *
 * The file is scanned byte by byte. Braces open and close groups, each group
 * inheriting the control words seen in its parent. Text is only emitted while
 * rtf_isPlainText() accepts the control words of the current group, which
 * hides destinations such as the font and colour tables.
 *
 * @param string $filename Path to the RTF file.
 * @return string The extracted text; "" when the file is empty or unreadable.
 */
function rtf2text($filename) {
    // Read the data from the input file.
    $text = file_get_contents($filename);
    if ($text === false || $text === "") {
        return "";
    }

    // Control words that translate directly into a fixed piece of text.
    $literals = array(
        "par" => "\n", "page" => "\n", "column" => "\n", "line" => "\n", "lbr" => "\n",
        "emspace" => " ", "enspace" => " ", "qmspace" => " ",
        "tab" => "\t",
        "emdash" => "—",
        "endash" => "–",
        "bullet" => "•",
        "lquote" => "‘",
        "rquote" => "’",
        "ldblquote" => "«",
        "rdblquote" => "»",
    );

    $document = "";
    // Control words seen in the current group, keyed by lower-cased word.
    $state = array();
    // States of the enclosing groups, innermost last.
    $saved = array();
    $len = strlen($text);

    // Read the data character by character.
    for ($i = 0; $i < $len; $i++) {
        $c = $text[$i];

        if ($c === "{") {
            // A new subgroup starts with a copy of the current state.
            $saved[] = $state;
        } elseif ($c === "}") {
            // End of subgroup: go back to the parent state. An unbalanced
            // closing brace simply resets to an empty state.
            $state = empty($saved) ? array() : array_pop($saved);
        } elseif ($c === "\0" || $c === "\r" || $c === "\f" || $c === "\n") {
            // Raw NUL, CR, FF and LF bytes carry no meaning in RTF; skip them.
        } elseif ($c !== "\\") {
            // Ordinary character: emit it if the current group is plain text.
            if (rtf_isPlainText($state)) {
                $document .= $c;
            }
        } else {
            // Backslash: look at the next character, if there is one.
            $nc = ($i + 1 < $len) ? $text[$i + 1] : "";

            if ($nc === "\\" || $nc === "~" || $nc === "_") {
                // Escaped backslash, non-breaking space and non-breaking hyphen.
                if (rtf_isPlainText($state)) {
                    if ($nc === "\\") {
                        $document .= "\\";
                    } elseif ($nc === "~") {
                        $document .= " ";
                    } else {
                        $document .= "-";
                    }
                }
            } elseif ($nc === "*") {
                // Mark the group as an ignorable destination.
                $state["*"] = true;
            } elseif ($nc === "'") {
                // \'hh: the next two characters are the hexadecimal code of
                // the character to emit.
                $hex = substr($text, $i + 2, 2);
                if (rtf_isPlainText($state) && preg_match('/^[0-9a-fA-F]{2}$/', $hex)) {
                    $document .= rtf_codePointToUtf8(hexdec($hex));
                }
                // Shift the pointer past the hex digits.
                $i += 2;
            } elseif (($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z')) {
                // A control word, possibly followed by a numeric parameter.
                $word = "";
                $param = null;
                for ($k = $i + 1, $m = 0; $k < $len; $k++, $m++) {
                    $ch = $text[$k];
                    if (($ch >= 'a' && $ch <= 'z') || ($ch >= 'A' && $ch <= 'Z')) {
                        // Letters belong to the word until a parameter starts.
                        // The parameter is compared with null because "0" is
                        // a valid parameter but is "empty" for PHP.
                        if ($param === null) {
                            $word .= $ch;
                        } else {
                            break;
                        }
                    } elseif ($ch >= '0' && $ch <= '9') {
                        $param .= $ch;
                    } elseif ($ch == '-') {
                        // A minus sign may only open the parameter.
                        if ($param === null) {
                            $param .= $ch;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                // Shift the pointer by the number of characters read.
                $i += $m - 1;

                $toText = "";
                $key = strtolower($word);
                if (isset($literals[$key])) {
                    $toText = $literals[$key];
                } else {
                    switch ($key) {
                        // \uN: N is the decimal Unicode code point. Values
                        // above 32767 are written as negative 16-bit numbers.
                        case "u":
                            if ($param !== null && $param !== "-") {
                                $code = (int)$param;
                                if ($code < 0) {
                                    $code += 65536;
                                }
                                $toText .= rtf_codePointToUtf8($code);
                            }
                            // \ucN recorded in the group tells how many
                            // fallback characters to jump over.
                            $ucDelta = isset($state["uc"]) ? (int)$state["uc"] : 0;
                            if ($ucDelta > 0) {
                                $i += $ucDelta;
                            }
                            break;
                        // Current date and time instead of the field labels.
                        case "chdate": $toText .= date("m.d.Y"); break;
                        case "chdpl": $toText .= date("l, j F Y"); break;
                        case "chdpa": $toText .= date("D, j M Y"); break;
                        case "chtime": $toText .= date("H:i:s"); break;
                        // Remember every other control word in the group
                        // state; a word without parameter is stored as true.
                        default:
                            $state[$key] = ($param === null) ? true : $param;
                            break;
                    }
                }

                // Add data to the output stream if required.
                if (rtf_isPlainText($state)) {
                    $document .= $toText;
                }
            }
            // Step over the character that followed the backslash.
            $i++;
        }
    }

    // Return result.
    return $document;
}