/libraries/Zend/Search/Lucene/Document/Docx.php

https://github.com/kiranatama/sagalaya · PHP · 156 lines · 73 code · 21 blank · 62 comment · 12 complexity · 9e2a3c5dae5af20f6279b1e03dbd3c20 MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Document
  18. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. namespace Zend\Search\Lucene\Document;
  22. use Zend\Search\Lucene,
  23. Zend\Search\Lucene\Exception\ExtensionNotLoadedException,
  24. Zend\Search\Lucene\Exception\RuntimeException,
  25. Zend\Search\Lucene\Document\Exception\InvalidArgumentException;
  26. /**
  27. * Docx document.
  28. *
  29. * @category Zend
  30. * @package Zend_Search_Lucene
  31. * @subpackage Document
  32. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  33. * @license http://framework.zend.com/license/new-bsd New BSD License
  34. */
  class Docx extends AbstractOpenXML
  {
      /**
       * Xml Schema - WordprocessingML
       *
       * @var string
       */
      const SCHEMA_WORDPROCESSINGML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

      /**
       * Object constructor
       *
       * Opens the given file as a zip package, collects the text of the main
       * document part and the package meta data, and registers the following
       * fields on this document:
       *  - "filename": the path that was passed in;
       *  - "body": the extracted text (stored or unstored, see $storeContent);
       *  - one text field per meta data property returned by extractMetaData();
       *  - "title": falls back to the file name when the meta data has no title.
       *
       * @param string  $fileName     Path of the .docx file to read
       * @param boolean $storeContent Whether the body text is stored in the index
       *                              (Field::Text) or only indexed (Field::UnStored)
       * @throws \Zend\Search\Lucene\Exception\ExtensionNotLoadedException
       *         When the ZipArchive class is not available
       * @throws \Zend\Search\Lucene\Exception\RuntimeException
       *         When the file cannot be opened as a zip archive or a required
       *         package part is missing or is not parseable XML
       */
      private function __construct($fileName, $storeContent)
      {
          if (!class_exists('ZipArchive', false)) {
              throw new ExtensionNotLoadedException(
                  'MS Office documents processing functionality requires Zip extension to be loaded'
              );
          }

          // Open the package. ZipArchive::open() returns true on success and an
          // error code otherwise, so anything but true means "not a usable zip".
          $package = new \ZipArchive();
          if ($package->open($fileName) !== true) {
              throw new RuntimeException('Invalid archive or corrupted .docx file.');
          }

          // Read body text and core properties. The archive handle is released
          // on every path, including when parsing throws.
          try {
              $documentBody   = $this->readDocumentBody($package);
              $coreProperties = $this->extractMetaData($package);
          } catch (\Exception $e) {
              $package->close();
              throw $e;
          }
          $package->close();

          // Store filename
          $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

          // Store contents
          if ($storeContent) {
              $this->addField(Field::Text('body', $documentBody, 'UTF-8'));
          } else {
              $this->addField(Field::UnStored('body', $documentBody, 'UTF-8'));
          }

          // Store meta data properties
          foreach ($coreProperties as $key => $value) {
              $this->addField(Field::Text($key, $value, 'UTF-8'));
          }

          // Store title (if not present in meta data)
          if (!isset($coreProperties['title'])) {
              $this->addField(Field::Text('title', $fileName, 'UTF-8'));
          }
      }

      /**
       * Extract the plain text of the main document part of an opened package.
       *
       * Looks up the first relationship of type SCHEMA_OFFICEDOCUMENT in
       * "_rels/.rels", loads the part it points to and concatenates the text of
       * all w:t elements found in the runs of the body's top-level paragraphs.
       * Every w:br element and every paragraph end contributes a single space so
       * that words are not glued together.
       *
       * @param \ZipArchive $package Already opened package
       * @return string Extracted text; empty string when the package declares no
       *                office document relationship
       * @throws \Zend\Search\Lucene\Exception\RuntimeException
       *         When the relations part or the main document part is missing or
       *         cannot be parsed as XML
       */
      private function readDocumentBody(\ZipArchive $package)
      {
          // Read relations and search for officeDocument
          $relationsXml = $package->getFromName('_rels/.rels');
          if ($relationsXml === false) {
              throw new RuntimeException('Invalid archive or corrupted .docx file.');
          }

          $relations = simplexml_load_string($relationsXml);
          if ($relations === false) {
              throw new RuntimeException('Invalid archive or corrupted .docx file.');
          }

          foreach ($relations->Relationship as $rel) {
              if ((string) $rel['Type'] !== AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
                  continue;
              }

              // Found office document! Resolve its path inside the archive.
              // The attribute is cast to string before it reaches the path helpers.
              $target   = (string) $rel['Target'];
              $partName = $this->absoluteZipPath(dirname($target) . '/' . basename($target));

              $partXml  = $package->getFromName($partName);
              $contents = ($partXml === false) ? false : simplexml_load_string($partXml);
              if ($contents === false) {
                  throw new RuntimeException('Invalid archive or corrupted .docx file.');
              }

              $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
              $paragraphs = $contents->xpath('//w:body/w:p');
              if (!is_array($paragraphs)) {
                  $paragraphs = array();
              }

              $pieces = array();
              foreach ($paragraphs as $paragraph) {
                  // Namespace registrations are per element, so bind "w" again
                  // here; this makes the query match by namespace rather than by
                  // whatever prefix the document author happened to use.
                  $paragraph->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
                  $runs = $paragraph->xpath('.//w:r/*[self::w:t or self::w:br]');
                  if (!is_array($runs)) {
                      // Query failed for this paragraph; skip it
                      continue;
                  }

                  foreach ($runs as $run) {
                      if ($run->getName() === 'br') {
                          // Break element
                          $pieces[] = ' ';
                      } else {
                          $pieces[] = (string) $run;
                      }
                  }

                  // Add space after each paragraph. So they are not bound together.
                  $pieces[] = ' ';
              }

              // Only the first office document relationship is processed
              return implode('', $pieces);
          }

          return '';
      }

      /**
       * Load Docx document from a file
       *
       * @param string  $fileName     Path of the .docx file to read
       * @param boolean $storeContent Whether the body text is stored in the index
       * @throws \Zend\Search\Lucene\Document\Exception\InvalidArgumentException
       *         When the file is not readable
       * @return \Zend\Search\Lucene\Document\Docx
       */
      public static function loadDocxFile($fileName, $storeContent = false)
      {
          if (!is_readable($fileName)) {
              throw new InvalidArgumentException('Provided file \'' . $fileName . '\' is not readable.');
          }

          return new self($fileName, $storeContent);
      }
  }