/lib/spout/src/Spout/Common/Helper/Escaper/XLSX.php

https://github.com/markn86/moodle · PHP · 190 lines · 76 code · 25 blank · 89 comment · 4 complexity · eb6c1e8f5ba0444ab01191ea499b0c50 MD5 · raw file

  1. <?php
  2. namespace Box\Spout\Common\Helper\Escaper;
  /**
   * Class XLSX
   * Provides functions to escape and unescape data for XLSX files.
   *
   * Two layers of escaping are applied when writing:
   *  - control characters that are not allowed in XML are rewritten as "_xHHHH_"
   *    (and literal text that already looks like "_xHHHH_" gets its leading
   *    underscore encoded as "_x005F_" so it is not mistaken for an escape);
   *  - XML special characters and quotes are encoded with htmlspecialchars().
   */
  class XLSX implements EscaperInterface
  {
      /** @var bool Whether the escaper has already been initialized */
      private $isAlreadyInitialized = false;

      /** @var string Regex pattern to detect control characters that need to be escaped */
      private $escapableControlCharactersPattern;

      /** @var string[] Map containing control characters to be escaped (value) and their escaped value (key) */
      private $controlCharactersEscapingMap;

      /** @var string[] Map containing control characters to be escaped (key) and their escaped value (value) */
      private $controlCharactersEscapingReverseMap;

      /**
       * Initializes the control characters if not already done.
       * The maps are built lazily, on the first call to escape() or unescape().
       *
       * @return void
       */
      protected function initIfNeeded()
      {
          if (!$this->isAlreadyInitialized) {
              $this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();
              $this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap();
              $this->controlCharactersEscapingReverseMap = \array_flip($this->controlCharactersEscapingMap);

              $this->isAlreadyInitialized = true;
          }
      }

      /**
       * Escapes the given string to make it compatible with XLSX
       *
       * @param string $string The string to escape
       * @return string The escaped string
       */
      public function escape($string)
      {
          $this->initIfNeeded();

          $escapedString = $this->escapeControlCharacters($string);
          // @NOTE: Using ENT_QUOTES as XML entities ('<', '>', '&') as well as
          //        single/double quotes (for XML attributes) need to be encoded.
          $escapedString = \htmlspecialchars($escapedString, ENT_QUOTES, 'UTF-8');

          return $escapedString;
      }

      /**
       * Unescapes the given string to make it compatible with XLSX
       *
       * @param string $string The string to unescape
       * @return string The unescaped string
       */
      public function unescape($string)
      {
          $this->initIfNeeded();

          // ==============
          // =   WARNING  =
          // ==============
          // It is assumed that the given string has already had its XML entities decoded.
          // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation).
          // Therefore there is no need to call "htmlspecialchars_decode()".
          $unescapedString = $this->unescapeControlCharacters($string);

          return $unescapedString;
      }

      /**
       * @return string Regex pattern (character class, without delimiters) containing all escapable control characters
       */
      protected function getEscapableControlCharactersPattern()
      {
          // control characters values are from 0 to 1F (hex values) in the ASCII table
          // some characters should not be escaped though: "\t", "\r" and "\n".
          return '[\x00-\x08' .
              // skipping "\t" (0x9) and "\n" (0xA)
              '\x0B-\x0C' .
              // skipping "\r" (0xD)
              '\x0E-\x1F]';
      }

      /**
       * Builds the map containing control characters to be escaped
       * mapped to their escaped values.
       * "\t", "\r" and "\n" don't need to be escaped.
       *
       * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
       * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
       *
       * @return string[] Escaped token ("_xHHHH_") => raw control character
       */
      protected function getControlCharactersEscapingMap()
      {
          $controlCharactersEscapingMap = [];

          // Read the pattern through its getter so this method does not depend on
          // the property having been assigned beforehand.
          $escapablePattern = '/' . $this->getEscapableControlCharactersPattern() . '/';

          // control characters values are from 0 to 1F (hex values) in the ASCII table
          for ($charValue = 0x00; $charValue <= 0x1F; $charValue++) {
              $character = \chr($charValue);
              if (\preg_match($escapablePattern, $character)) {
                  // 4 upper-case hex digits, zero padded: 0x0B => "_x000B_"
                  $escapedChar = \sprintf('_x%04X_', $charValue);
                  $controlCharactersEscapingMap[$escapedChar] = $character;
              }
          }

          return $controlCharactersEscapingMap;
      }

      /**
       * Converts PHP control characters from the given string to OpenXML escaped control characters
       *
       * Excel escapes control characters with _xHHHH_ and also escapes any
       * literal strings of that type by encoding the leading underscore.
       * So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
       *
       * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
       * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
       *
       * @param string $string String to escape
       * @return string
       */
      protected function escapeControlCharacters($string)
      {
          // Literal "_xHHHH_" sequences must be protected first, otherwise the
          // tokens produced below would get their own underscore escaped too.
          $escapedString = $this->escapeEscapeCharacter($string);

          // The reverse map is keyed by the raw control character, so a single
          // strtr() pass swaps every escapable character for its "_xHHHH_" token
          // and returns the string untouched when there is nothing to replace.
          return \strtr($escapedString, $this->controlCharactersEscapingReverseMap);
      }

      /**
       * Escapes the escape character: "_x0000_" -> "_x005F_x0000_"
       *
       * The underscore is matched with a lookahead so that the closing underscore
       * of one sequence can also be the opening underscore of the next one:
       * "_x0000_x0000_" -> "_x005F_x0000_x005F_x0000_".
       *
       * @param string $string String to escape
       * @return string The escaped string
       */
      protected function escapeEscapeCharacter($string)
      {
          return \preg_replace('/_(?=x[\dA-F]{4}_)/', '_x005F_', $string);
      }

      /**
       * Converts OpenXML escaped control characters from the given string to PHP control characters
       *
       * Excel escapes control characters with _xHHHH_ and also escapes any
       * literal strings of that type by encoding the leading underscore.
       * So "_x0000_" -> "\0" and "_x005F_x0000_" -> "_x0000_"
       *
       * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
       * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
       *
       * @param string $string String to unescape
       * @return string
       */
      protected function unescapeControlCharacters($string)
      {
          // Every escape token starts with "_x": without it there is nothing to do
          // and the regex passes below can be skipped entirely.
          if (\strpos($string, '_x') === false) {
              return $string;
          }

          $unescapedString = $string;

          foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) {
              // only unescape characters that don't contain the escaped escape character for now
              $quotedEscapedCharValue = \preg_quote($escapedCharValue, '/');
              $unescapedString = \preg_replace("/(?<!_x005F)($quotedEscapedCharValue)/", $charValue, $unescapedString);
          }

          return $this->unescapeEscapeCharacter($unescapedString);
      }

      /**
       * Unescapes the escape character: "_x005F_x0000_" => "_x0000_"
       *
       * Only the "_x005F_" prefix is consumed (the following "xHHHH_" is checked
       * with a lookahead), so back-to-back escaped sequences sharing an underscore
       * are all restored: "_x005F_x0000_x005F_x0000_" => "_x0000_x0000_".
       *
       * @param string $string String to unescape
       * @return string The unescaped string
       */
      protected function unescapeEscapeCharacter($string)
      {
          return \preg_replace('/_x005F_(?=x[\dA-F]{4}_)/', '_', $string);
      }
  }