PageRenderTime 47ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/modules/AOD_Index/LuceneUtils.php

https://gitlab.com/tjaafar/SuiteCRM
PHP | 320 lines | 221 code | 18 blank | 81 comment | 16 complexity | 36ef4375bb0756c50e81c0b4371baac6 MD5 | raw file
  1. <?php
  2. /**
  3. *
  4. *
  5. * @package
  6. * @copyright SalesAgility Ltd http://www.salesagility.com
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by
  10. * the Free Software Foundation; either version 3 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE
  19. * along with this program; if not, see http://www.gnu.org/licenses
  20. * or write to the Free Software Foundation,Inc., 51 Franklin Street,
  21. * Fifth Floor, Boston, MA 02110-1301 USA
  22. *
  23. * @author Salesagility Ltd <support@salesagility.com>
  24. */
  /**
   * Makes the bundled Zend Lucene library loadable and loads it.
   *
   * Appends "modules/AOD_Index/Lib" to the current include path and then
   * requires Zend/Search/Lucene.php through that path.
   */
  function requireLucene(){
      $extendedIncludePath = get_include_path() . PATH_SEPARATOR . "modules/AOD_Index/Lib";
      set_include_path($extendedIncludePath);
      require_once 'Zend/Search/Lucene.php';
  }
  /**
   * Builds the relative path of a document revision inside the upload directory.
   *
   * @param $revisionId Identifier appended to the "upload/" prefix.
   * @return string "upload/" followed by the given identifier.
   */
  function getDocumentRevisionPath($revisionId){
      $uploadPrefix = 'upload/';
      return $uploadPrefix . $revisionId;
  }
  /**
   * Builds a Lucene document from a PPTX file.
   *
   * The file is loaded through Zend_Search_Lucene_Document_Pptx::loadPptxFile()
   * and a 'filename' text field holding the base name of $path is added.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createPPTXDocument($path){
      $luceneDocument = Zend_Search_Lucene_Document_Pptx::loadPptxFile($path);
      $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
      $luceneDocument->addField($filenameField);
      return $luceneDocument;
  }
  /**
   * Builds a Lucene document from an XLSX file.
   *
   * The file is loaded through Zend_Search_Lucene_Document_Xlsx::loadXlsxFile()
   * and a 'filename' text field holding the base name of $path is added.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createXLSXDocument($path){
      $luceneDocument = Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($path);
      $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
      $luceneDocument->addField($filenameField);
      return $luceneDocument;
  }
  /**
   * Builds a Lucene document from an HTML file.
   *
   * The file is loaded through Zend_Search_Lucene_Document_Html::loadHTMLFile()
   * and a 'filename' text field holding the base name of $path is added.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createHTMLDocument($path){
      $luceneDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($path);
      $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
      $luceneDocument->addField($filenameField);
      return $luceneDocument;
  }
  /**
   * Builds a Lucene document from a DOCX file.
   *
   * The file is loaded through Zend_Search_Lucene_Document_Docx::loadDocxFile()
   * and a 'filename' text field holding the base name of $path is added.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createDocXDocument($path){
      $luceneDocument = Zend_Search_Lucene_Document_Docx::loadDocxFile($path);
      $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
      $luceneDocument->addField($filenameField);
      return $luceneDocument;
  }
  /**
   * Given a path to a Doc document returns a lucene document with filename and contents set.
   *
   * The text is recovered heuristically: the raw bytes are split on carriage
   * returns, every chunk that is empty or contains a NUL byte is discarded, and
   * the surviving chunks are joined with spaces and stripped of every character
   * outside a small whitelist.
   *
   * A file that is missing, unreadable or empty yields a document whose
   * 'contents' field is the empty string.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createDocDocument($path){
      // Read the whole file in one call. The previous fopen()/fread() pair passed
      // filesize() as the read length, which is rejected when the file is empty,
      // and kept the handle open until the very end of the function.
      $rawBytes = is_file($path) ? file_get_contents($path) : false;
      if ($rawBytes === false) {
          $rawBytes = '';
      }

      $outtext = "";
      foreach (explode(chr(0x0D), $rawBytes) as $thisline) {
          // Keep only non-empty chunks that contain no NUL byte.
          if ($thisline !== '' && strpos($thisline, chr(0x00)) === false) {
              $outtext .= $thisline." ";
          }
      }
      // Drop everything except letters, digits, whitespace and basic punctuation.
      $outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/","",$outtext);

      $doc = new Zend_Search_Lucene_Document();
      $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
      $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $outtext));
      return $doc;
  }
  /**
   * Builds a Lucene document from a PDF file.
   *
   * The text comes from PdfParser::parseFile() and is added as an unstored
   * 'contents' field, next to a 'filename' text field holding the base name
   * of $path.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createPDFDocument($path){
      require_once 'PdfParser.php';
      $pdfText = PdfParser::parseFile($path);
      $luceneDocument = new Zend_Search_Lucene_Document();
      $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
      $luceneDocument->addField($filenameField);
      $contentsField = Zend_Search_Lucene_Field::UnStored('contents', $pdfText);
      $luceneDocument->addField($contentsField);
      return $luceneDocument;
  }
  /**
   * Given a path to an ODT doc returns a lucene document with contents and filename set.
   *
   * The archive's "content.xml" entry is parsed and the string value of every
   * element in the "text" namespace is collected into the 'contents' field.
   *
   * Returns false when $path is not a file, cannot be opened as a zip archive,
   * has no "content.xml" entry, or that entry is not well-formed XML.
   * @param $path
   * @return bool|Zend_Search_Lucene_Document
   */
  function createOdtDocument($path){
      if(!is_file($path)){
          return false;
      }

      $package = new ZipArchive();
      // ZipArchive::open() returns true on success and an error code otherwise.
      if ($package->open($path) !== true) {
          return false;
      }
      $contentXml = $package->getFromName("content.xml");
      // The archive is no longer needed once the entry has been read.
      $package->close();
      if ($contentXml === false) {
          return false;
      }

      // Collect libxml problems internally so a malformed document is reported
      // through the return value rather than as a stream of warnings.
      $previousErrorMode = libxml_use_internal_errors(true);
      $contents = simplexml_load_string($contentXml);
      $paragraphs = ($contents === false) ? false : $contents->xpath('//text:*');
      libxml_clear_errors();
      libxml_use_internal_errors($previousErrorMode);

      if ($contents === false) {
          return false;
      }
      if (!is_array($paragraphs)) {
          // The XPath query failed; index the file with empty contents.
          $paragraphs = array();
      }

      $documentBody = array();
      foreach ($paragraphs as $paragraph) {
          $documentBody[] = (string)$paragraph;
          $documentBody[] = ' ';
      }

      $doc = new Zend_Search_Lucene_Document();
      $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', implode(' ', $documentBody), 'UTF-8'));
      $doc->addField(Zend_Search_Lucene_Field::Text('filename', basename($path)));
      return $doc;
  }
  /**
   * Builds a Lucene document from a plain text file.
   *
   * Adds a 'filename' text field holding the base name of $path and an unstored
   * 'contents' field holding whatever file_get_contents() returns for $path.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createTextDocument($path){
      $luceneDocument = new Zend_Search_Lucene_Document();
      $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
      $luceneDocument->addField($filenameField);
      $fileBody = file_get_contents($path);
      $luceneDocument->addField(Zend_Search_Lucene_Field::UnStored('contents', $fileBody));
      return $luceneDocument;
  }
  /**
   * Builds a Lucene document from an RTF file.
   *
   * Adds a 'filename' text field holding the base name of $path and an unstored
   * 'contents' field holding the text that rtf2text() extracts from the file.
   * @param $path
   * @return Zend_Search_Lucene_Document
   */
  function createRTFDocument($path){
      $luceneDocument = new Zend_Search_Lucene_Document();
      $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
      $luceneDocument->addField($filenameField);
      $plainText = rtf2text($path);
      $luceneDocument->addField(Zend_Search_Lucene_Field::UnStored('contents', $plainText));
      return $luceneDocument;
  }
  /**
   * Tells whether a group state allows its text to be emitted.
   *
   * @param $s Map of control words (as set by rtf2text) for the current group.
   * @return bool false when any of "*", "fonttbl", "colortbl", "datastore" or
   *              "themedata" is set to a non-empty value in $s, true otherwise.
   */
  function rtf_isPlainText($s) {
      $blockingKeys = array("*", "fonttbl", "colortbl", "datastore", "themedata");
      foreach ($blockingKeys as $blockingKey) {
          if (!empty($s[$blockingKey])) {
              return false;
          }
      }
      return true;
  }
  /**
   * Extracts the plain text from an RTF file.
   *
   * The file is scanned byte by byte. Control words seen inside a
   * brace-delimited group are recorded in that group's state (a child group
   * starts with a copy of its parent's state), and text is emitted only while
   * rtf_isPlainText() accepts the current state.
   *
   * @param $filename Path of the RTF file to read.
   * @return string The extracted text, or "" when the file is empty or cannot be read.
   */
  function rtf2text($filename) {
      // Read the data from the input file.
      $text = file_get_contents($filename);
      if ($text === false || $text === '') {
          return "";
      }

      $document = "";
      // $state holds the control words of the group being read; $stack holds the
      // saved states of the enclosing groups.
      $stack = array();
      $state = array();
      $len = strlen($text);

      // Read the data character by character.
      for ($i = 0; $i < $len; $i++) {
          $c = $text[$i];
          switch ($c) {
              // The backslash introduces a control symbol or a control word.
              case "\\":
                  // Look at the next character; '' when the backslash is the last byte.
                  $nc = ($i + 1 < $len) ? $text[$i + 1] : '';
                  $isLetter = ($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z');

                  // Escaped backslash, non-breaking space and non-breaking hyphen are plain text.
                  if ($nc === '\\' && rtf_isPlainText($state)) {
                      $document .= '\\';
                  } elseif ($nc === '~' && rtf_isPlainText($state)) {
                      $document .= ' ';
                  } elseif ($nc === '_' && rtf_isPlainText($state)) {
                      $document .= '-';
                  // An asterisk marks the current group; remember it in the state.
                  } elseif ($nc === '*') {
                      $state["*"] = true;
                  // A single quote is followed by two hexadecimal digits naming a character.
                  } elseif ($nc === "'") {
                      $hex = substr($text, $i + 2, 2);
                      if (rtf_isPlainText($state)) {
                          $document .= html_entity_decode("&#".hexdec($hex).";");
                      }
                      // Skip the two hex digits.
                      $i += 2;
                  // A letter starts a control word, optionally followed by a numeric parameter.
                  } elseif ($isLetter) {
                      $word = "";
                      $param = null;
                      // Read the control word and its parameter; $m counts the characters consumed.
                      for ($k = $i + 1, $m = 0; $k < $len; $k++, $m++) {
                          $nc = $text[$k];
                          if (($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z')) {
                              // Letters belong to the word only until the parameter starts.
                              // Compare against null, not empty(): a parameter of "0" is a
                              // real parameter and must end the word as well.
                              if ($param === null) {
                                  $word .= $nc;
                              } else {
                                  break;
                              }
                          } elseif ($nc >= '0' && $nc <= '9') {
                              $param .= $nc;
                          } elseif ($nc === '-') {
                              // A minus sign is only valid as the first parameter character.
                              if ($param === null) {
                                  $param .= $nc;
                              } else {
                                  break;
                              }
                          } else {
                              break;
                          }
                      }
                      // Move the pointer over the characters just read.
                      $i += $m - 1;

                      // Translate the control words we care about into text.
                      $toText = "";
                      switch (strtolower($word)) {
                          // \uN: N is the decimal code of a Unicode character. The "uc"
                          // value recorded for the group says how many following
                          // characters to skip after it.
                          case "u":
                              $codePoint = (int)$param;
                              if ($codePoint < 0) {
                                  // N is written as a signed 16-bit number.
                                  $codePoint += 65536;
                              }
                              $toText .= html_entity_decode("&#x".dechex($codePoint).";");
                              $ucDelta = isset($state["uc"]) ? (int)$state["uc"] : 0;
                              if ($ucDelta > 0) {
                                  $i += $ucDelta;
                              }
                              break;
                          // Line feeds, spaces and tabs.
                          case "par": case "page": case "column": case "line": case "lbr":
                              $toText .= "\n";
                              break;
                          case "emspace": case "enspace": case "qmspace":
                              $toText .= " ";
                              break;
                          case "tab": $toText .= "\t"; break;
                          // Current date and time instead of the corresponding labels.
                          case "chdate": $toText .= date("m.d.Y"); break;
                          case "chdpl": $toText .= date("l, j F Y"); break;
                          case "chdpa": $toText .= date("D, j M Y"); break;
                          case "chtime": $toText .= date("H:i:s"); break;
                          // Special characters, produced through their HTML entities.
                          case "emdash": $toText .= html_entity_decode("&mdash;"); break;
                          case "endash": $toText .= html_entity_decode("&ndash;"); break;
                          // &bull; rather than &#149;: 149 is the Windows-1252 byte for the
                          // bullet, not its Unicode code point.
                          case "bullet": $toText .= html_entity_decode("&bull;"); break;
                          case "lquote": $toText .= html_entity_decode("&lsquo;"); break;
                          case "rquote": $toText .= html_entity_decode("&rsquo;"); break;
                          case "ldblquote": $toText .= html_entity_decode("&laquo;"); break;
                          case "rdblquote": $toText .= html_entity_decode("&raquo;"); break;
                          // Every other control word is recorded in the group state, with
                          // its parameter, or true when it has none.
                          default:
                              $state[strtolower($word)] = ($param === null) ? true : $param;
                              break;
                      }
                      // Add data to the output stream if required.
                      if (rtf_isPlainText($state)) {
                          $document .= $toText;
                      }
                  }
                  // Skip the character that followed the backslash.
                  $i++;
                  break;
              // Opening brace: a subgroup starts. Save the current state; the
              // subgroup carries on with a copy of it.
              case "{":
                  $stack[] = $state;
                  break;
              // Closing brace: the subgroup ends, go back to the enclosing state.
              case "}":
                  $state = empty($stack) ? array() : array_pop($stack);
                  break;
              // Skip raw NUL, CR, FF and LF bytes. The literals must be double-quoted
              // so that they denote the control characters themselves.
              case "\0": case "\r": case "\f": case "\n":
                  break;
              // Add other data to the output stream if required.
              default:
                  if (rtf_isPlainText($state)) {
                      $document .= $c;
                  }
                  break;
          }
      }
      // Return result.
      return $document;
  }