/library/Zend/Search/Lucene/Document/Pptx.php

https://bitbucket.org/Ebozavrik/test-application · PHP · 200 lines · 87 code · 26 blank · 87 comment · 12 complexity · 6a5447e326663750d2515ea6fcb967b5 MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Document
  18. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Pptx.php 24593 2012-01-05 20:35:02Z matthew $
  21. */
  22. /** Zend_Search_Lucene_Document_OpenXml */
  23. require_once 'Zend/Search/Lucene/Document/OpenXml.php';
  24. /**
  25. * Pptx document.
  26. *
  27. * @category Zend
  28. * @package Zend_Search_Lucene
  29. * @subpackage Document
  30. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  31. * @license http://framework.zend.com/license/new-bsd New BSD License
  32. */
  33. class Zend_Search_Lucene_Document_Pptx extends Zend_Search_Lucene_Document_OpenXml
  34. {
  35. /**
  36. * Xml Schema - PresentationML
  37. *
  38. * @var string
  39. */
  40. const SCHEMA_PRESENTATIONML = 'http://schemas.openxmlformats.org/presentationml/2006/main';
  41. /**
  42. * Xml Schema - DrawingML
  43. *
  44. * @var string
  45. */
  46. const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';
  47. /**
  48. * Xml Schema - Slide relation
  49. *
  50. * @var string
  51. */
  52. const SCHEMA_SLIDERELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide';
  53. /**
  54. * Xml Schema - Slide notes relation
  55. *
  56. * @var string
  57. */
  58. const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';
  59. /**
  60. * Object constructor
  61. *
  62. * @param string $fileName
  63. * @param boolean $storeContent
  64. *
  65. * @throws Zend_Search_Lucene_Exception
  66. */
  67. private function __construct ($fileName, $storeContent)
  68. {
  69. if (!class_exists('ZipArchive', false)) {
  70. require_once 'Zend/Search/Lucene/Exception.php';
  71. throw new Zend_Search_Lucene_Exception( 'MS Office documents processing functionality requires Zip extension to be loaded' );
  72. }
  73. // Document data holders
  74. $slides = array();
  75. $slideNotes = array();
  76. $documentBody = array();
  77. $coreProperties = array();
  78. // Open OpenXML package
  79. $package = new ZipArchive();
  80. $package->open($fileName);
  81. // Read relations and search for officeDocument
  82. $relationsXml = $package->getFromName('_rels/.rels');
  83. if ($relationsXml === false) {
  84. require_once 'Zend/Search/Lucene/Exception.php';
  85. throw new Zend_Search_Lucene_Exception( 'Invalid archive or corrupted .pptx file.' );
  86. }
  87. $relations = simplexml_load_string($relationsXml);
  88. foreach ($relations->Relationship as $rel) {
  89. if ($rel["Type"] == Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
  90. // Found office document! Search for slides...
  91. $slideRelations = simplexml_load_string($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")));
  92. foreach ($slideRelations->Relationship as $slideRel) {
  93. if ($slideRel["Type"] == Zend_Search_Lucene_Document_Pptx::SCHEMA_SLIDERELATION) {
  94. // Found slide!
  95. $slides[str_replace('rId', '', (string)$slideRel["Id"])] = simplexml_load_string(
  96. $package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . basename($slideRel["Target"])))
  97. );
  98. // Search for slide notes
  99. $slideNotesRelations = simplexml_load_string($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/_rels/" . basename($slideRel["Target"]) . ".rels")));
  100. foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
  101. if ($slideNoteRel["Type"] == Zend_Search_Lucene_Document_Pptx::SCHEMA_SLIDENOTESRELATION) {
  102. // Found slide notes!
  103. $slideNotes[str_replace('rId', '', (string)$slideRel["Id"])] = simplexml_load_string(
  104. $package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"])))
  105. );
  106. break;
  107. }
  108. }
  109. }
  110. }
  111. break;
  112. }
  113. }
  114. // Sort slides
  115. ksort($slides);
  116. ksort($slideNotes);
  117. // Extract contents from slides
  118. foreach ($slides as $slideKey => $slide) {
  119. // Register namespaces
  120. $slide->registerXPathNamespace("p", Zend_Search_Lucene_Document_Pptx::SCHEMA_PRESENTATIONML);
  121. $slide->registerXPathNamespace("a", Zend_Search_Lucene_Document_Pptx::SCHEMA_DRAWINGML);
  122. // Fetch all text
  123. $textElements = $slide->xpath('//a:t');
  124. foreach ($textElements as $textElement) {
  125. $documentBody[] = (string)$textElement;
  126. }
  127. // Extract contents from slide notes
  128. if (isset( $slideNotes[$slideKey] )) {
  129. // Fetch slide note
  130. $slideNote = $slideNotes[$slideKey];
  131. // Register namespaces
  132. $slideNote->registerXPathNamespace("p", Zend_Search_Lucene_Document_Pptx::SCHEMA_PRESENTATIONML);
  133. $slideNote->registerXPathNamespace("a", Zend_Search_Lucene_Document_Pptx::SCHEMA_DRAWINGML);
  134. // Fetch all text
  135. $textElements = $slideNote->xpath('//a:t');
  136. foreach ($textElements as $textElement) {
  137. $documentBody[] = (string)$textElement;
  138. }
  139. }
  140. }
  141. // Read core properties
  142. $coreProperties = $this->extractMetaData($package);
  143. // Close file
  144. $package->close();
  145. // Store filename
  146. $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));
  147. // Store contents
  148. if ($storeContent) {
  149. $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
  150. } else {
  151. $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
  152. }
  153. // Store meta data properties
  154. foreach ($coreProperties as $key => $value) {
  155. $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
  156. }
  157. // Store title (if not present in meta data)
  158. if (!isset( $coreProperties['title'] )) {
  159. $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
  160. }
  161. }
  162. /**
  163. * Load Pptx document from a file
  164. *
  165. * @param string $fileName
  166. * @param boolean $storeContent
  167. *
  168. * @return Zend_Search_Lucene_Document_Pptx
  169. */
  170. public static function loadPptxFile ($fileName, $storeContent = false)
  171. {
  172. return new Zend_Search_Lucene_Document_Pptx( $fileName, $storeContent );
  173. }
  174. }