LuceneUtils.php | searchcode

/modules/AOD_Index/LuceneUtils.php

https://gitlab.com/tjaafar/SuiteCRM
PHP | 320 lines | 221 code | 18 blank | 81 comment | 16 complexity | 36ef4375bb0756c50e81c0b4371baac6 MD5 | raw file

<?php
 /**
 * 
 * 
 * @package 
 * @copyright SalesAgility Ltd http://www.salesagility.com
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE
 * along with this program; if not, see http://www.gnu.org/licenses
 * or write to the Free Software Foundation,Inc., 51 Franklin Street,
 * Fifth Floor, Boston, MA 02110-1301  USA
 *
 * @author Salesagility Ltd <support@salesagility.com>
 */



function requireLucene(){
    set_include_path(get_include_path() . PATH_SEPARATOR . "modules/AOD_Index/Lib");
    require_once('Zend/Search/Lucene.php');
}

function getDocumentRevisionPath($revisionId){
    return "upload/$revisionId";
}

/**
 * Given a path to a PPTX document returns a lucene document with filename and contents set.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createPPTXDocument($path){
    $doc = Zend_Search_Lucene_Document_Pptx::loadPptxFile($path);
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    return $doc;
}

/**
 * Given a path to a XLSX document returns a lucene document with filename and contents set.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createXLSXDocument($path){
    $doc = Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($path);
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    return $doc;
}
/**
 * Given a path to a HTML document returns a lucene document with filename and contents set.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createHTMLDocument($path){
    $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($path);
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    return $doc;
}
/**
 * Given a path to a DocX document returns a lucene document with filename and contents set.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createDocXDocument($path){
    $doc = Zend_Search_Lucene_Document_Docx::loadDocxFile($path);
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    return $doc;
}

/**
 * Given a path to a Doc document returns a lucene document with filename and contents set.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createDocDocument($path){
    $fileHandle = fopen($path, "r");
    $line = @fread($fileHandle, filesize($path));
    $lines = explode(chr(0x0D),$line);
    $outtext = "";
    foreach($lines as $thisline)
    {
        $pos = strpos($thisline, chr(0x00));
        if (($pos !== FALSE)||(strlen($thisline)==0))
        {
        } else {
            $outtext .= $thisline." ";
        }
    }
    $outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/","",$outtext);

    $doc = new Zend_Search_Lucene_Document();
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $outtext));
    fclose($fileHandle);
    return $doc;
}

/**
 * Given a path to a PDF document returns a lucene document with filename and contents set.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createPDFDocument($path){
    require_once('PdfParser.php');
    $text = PdfParser::parseFile($path);
    $doc = new Zend_Search_Lucene_Document();
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $text));
    return $doc;
}

/**
 * Given a path to an ODT doc returns a lucene document with contents and filename set.
 * @param $path
 * @return bool|Zend_Search_Lucene_Document
 */
function createOdtDocument($path){
    if(!is_file($path)){
        return false;
    }
    $doc = new Zend_Search_Lucene_Document();
    $documentBody = array();
    $coreProperties = array();
    $package = new ZipArchive();
    $package->open($path);
    $contents = simplexml_load_string($package->getFromName("content.xml"));
    $paragraphs = $contents->xpath('//text:*');
    foreach ($paragraphs as $paragraph) {
        $documentBody[] = (string)$paragraph;
        $documentBody[] = ' ';
    }
    // Close file
    $package->close();
    $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', implode(' ', $documentBody), 'UTF-8'));
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    return $doc;
}

/**
 * Given a path to a plain text doc returns a lucene document with $filename and $contents set appropriately.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createTextDocument($path){
    $doc = new Zend_Search_Lucene_Document();
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', file_get_contents($path)));
    return $doc;
}


/**
 * Given the path to an rtf document returns a lucene document with $filename and $contents set appropriately.
 * @param $path
 * @return Zend_Search_Lucene_Document
 */
function createRTFDocument($path){
    $doc = new Zend_Search_Lucene_Document();
    $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
    $contents = rtf2text($path);
    //print_r($contents);
    $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $contents));
    return $doc;
}

function rtf_isPlainText($s) {
    $arrfailAt = array("*", "fonttbl", "colortbl", "datastore", "themedata");
    for ($i = 0; $i < count($arrfailAt); $i++)
        if (!empty($s[$arrfailAt[$i]])) return false;
    return true;
}

function rtf2text($filename) {
    // Read the data from the input file.
    $text = file_get_contents($filename);
    if (!strlen($text))
        return "";

    // Create empty stack array.
    $document = "";
    $stack = array();
    $j = -1;
    // Read the data character-by- character…
    for ($i = 0, $len = strlen($text); $i < $len; $i++) {
        $c = $text[$i];

        // Depending on current character select the further actions.
        switch ($c) {
            // the most important key word backslash
            case "\\":
                // read next character
                $nc = $text[$i + 1];

                // If it is another backslash or nonbreaking space or hyphen,
                // then the character is plain text and add it to the output stream.
                if ($nc == '\\' && rtf_isPlainText($stack[$j])) $document .= '\\';
                elseif ($nc == '~' && rtf_isPlainText($stack[$j])) $document .= ' ';
                elseif ($nc == '_' && rtf_isPlainText($stack[$j])) $document .= '-';
                // If it is an asterisk mark, add it to the stack.
                elseif ($nc == '*') $stack[$j]["*"] = true;
                // If it is a single quote, read next two characters that are the hexadecimal notation
                // of a character we should add to the output stream.
                elseif ($nc == "'") {
                    $hex = substr($text, $i + 2, 2);
                    if (rtf_isPlainText($stack[$j]))
                        $document .= html_entity_decode("&#".hexdec($hex).";");
                    //Shift the pointer.
                    $i += 2;
                    // Since, we’ve found the alphabetic character, the next characters are control word
                    // and, possibly, some digit parameter.
                } elseif ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
                    $word = "";
                    $param = null;

                    // Start reading characters after the backslash.
                    for ($k = $i + 1, $m = 0; $k < strlen($text); $k++, $m++) {
                        $nc = $text[$k];
                        // If the current character is a letter and there were no digits before it,
                        // then we’re still reading the control word. If there were digits, we should stop
                        // since we reach the end of the control word.
                        if ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
                            if (empty($param))
                                $word .= $nc;
                            else
                                break;
                            // If it is a digit, store the parameter.
                        } elseif ($nc >= '0' && $nc <= '9')
                            $param .= $nc;
                        // Since minus sign may occur only before a digit parameter, check whether
                        // $param is empty. Otherwise, we reach the end of the control word.
                        elseif ($nc == '-') {
                            if (empty($param))
                                $param .= $nc;
                            else
                                break;
                        } else
                            break;
                    }
                    // Shift the pointer on the number of read characters.
                    $i += $m - 1;

                    // Start analyzing what we’ve read. We are interested mostly in control words.
                    $toText = "";
                    switch (strtolower($word)) {
                        // If the control word is "u", then its parameter is the decimal notation of the
                        // Unicode character that should be added to the output stream.
                        // We need to check whether the stack contains \ucN control word. If it does,
                        // we should remove the N characters from the output stream.
                        case "u":
                            $toText .= html_entity_decode("&#x".dechex($param).";");
                            $ucDelta = @$stack[$j]["uc"];
                            if ($ucDelta > 0)
                                $i += $ucDelta;
                            break;
                        // Select line feeds, spaces and tabs.
                        case "par": case "page": case "column": case "line": case "lbr":
                        $toText .= "\n";
                        break;
                        case "emspace": case "enspace": case "qmspace":
                        $toText .= " ";
                        break;
                        case "tab": $toText .= "\t"; break;
                        // Add current date and time instead of corresponding labels.
                        case "chdate": $toText .= date("m.d.Y"); break;
                        case "chdpl": $toText .= date("l, j F Y"); break;
                        case "chdpa": $toText .= date("D, j M Y"); break;
                        case "chtime": $toText .= date("H:i:s"); break;
                        // Replace some reserved characters to their html analogs.
                        case "emdash": $toText .= html_entity_decode("&mdash;"); break;
                        case "endash": $toText .= html_entity_decode("&ndash;"); break;
                        case "bullet": $toText .= html_entity_decode("&#149;"); break;
                        case "lquote": $toText .= html_entity_decode("&lsquo;"); break;
                        case "rquote": $toText .= html_entity_decode("&rsquo;"); break;
                        case "ldblquote": $toText .= html_entity_decode("&laquo;"); break;
                        case "rdblquote": $toText .= html_entity_decode("&raquo;"); break;
                        // Add all other to the control words stack. If a control word
                        // does not include parameters, set &param to true.
                        default:
                            $stack[$j][strtolower($word)] = empty($param) ? true : $param;
                            break;
                    }
                    // Add data to the output stream if required.
                    if (rtf_isPlainText($stack[$j]))
                        $document .= $toText;
                }

                $i++;
                break;
            // If we read the opening brace {, then new subgroup starts and we add
            // new array stack element and write the data from previous stack element to it.
            case "{":
                array_push($stack, $stack[$j++]);
                break;
            // If we read the closing brace }, then we reach the end of subgroup and should remove
            // the last stack element.
            case "}":
                array_pop($stack);
                $j--;
                break;
            // Skip “trash”.
            case '\0': case '\r': case '\f': case '\n': break;
            // Add other data to the output stream if required.
            default:
                if (rtf_isPlainText($stack[$j]))
                    $document .= $c;
                break;
        }
    }
    // Return result.
    return $document;
}