PageRenderTime 38ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/includes/libs/StringUtils.php

https://gitlab.com/link233/bootmw
PHP | 288 lines | 124 code | 23 blank | 141 comment | 23 complexity | cb3266e74fdb4da9d8dc897c748920b6 MD5 | raw file
  1. <?php
  2. /**
  3. * Methods to play with strings.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. */
  22. /**
  23. * A collection of static methods to play with strings.
  24. */
  25. class StringUtils {
  26. /**
  27. * Test whether a string is valid UTF-8.
  28. *
  29. * The function check for invalid byte sequences, overlong encoding but
  30. * not for different normalisations.
  31. *
  32. * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
  33. * In particular, the pure PHP code path did not in fact check for overlong forms.
  34. * Beware of this when backporting code to that version of MediaWiki.
  35. *
  36. * @since 1.21
  37. * @param string $value String to check
  38. * @return bool Whether the given $value is a valid UTF-8 encoded string
  39. */
  40. static function isUtf8( $value ) {
  41. $value = (string)$value;
  42. // HHVM 3.4 and older come with an outdated version of libmbfl that
  43. // incorrectly allows values above U+10FFFF, so we have to check
  44. // for them separately. (This issue also exists in PHP 5.3 and
  45. // older, which are no longer supported.)
  46. static $newPHP;
  47. if ( $newPHP === null ) {
  48. $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
  49. }
  50. return mb_check_encoding( $value, 'UTF-8' ) &&
  51. ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
  52. }
  53. /**
  54. * Perform an operation equivalent to `preg_replace()`
  55. *
  56. * Matches this code:
  57. *
  58. * preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
  59. *
  60. * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
  61. * implementation is fast but memory-hungry and inflexible. The memory requirements are such
  62. * that I don't recommend using it on anything but guaranteed small chunks of text.
  63. *
  64. * @param string $startDelim
  65. * @param string $endDelim
  66. * @param string $replace
  67. * @param string $subject
  68. * @return string
  69. */
  70. static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
  71. $segments = explode( $startDelim, $subject );
  72. $output = array_shift( $segments );
  73. foreach ( $segments as $s ) {
  74. $endDelimPos = strpos( $s, $endDelim );
  75. if ( $endDelimPos === false ) {
  76. $output .= $startDelim . $s;
  77. } else {
  78. $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
  79. }
  80. }
  81. return $output;
  82. }
  83. /**
  84. * Perform an operation equivalent to `preg_replace_callback()`
  85. *
  86. * Matches this code:
  87. *
  88. * preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
  89. *
  90. * If the start delimiter ends with an initial substring of the end delimiter,
  91. * e.g. in the case of C-style comments, the behavior differs from the model
  92. * regex. In this implementation, the end must share no characters with the
  93. * start, so e.g. `/*\/` is not considered to be both the start and end of a
  94. * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
  95. *
  96. * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
  97. * but uses far less memory. The delimiters are literal strings, not regular expressions.
  98. *
  99. * @param string $startDelim Start delimiter
  100. * @param string $endDelim End delimiter
  101. * @param callable $callback Function to call on each match
  102. * @param string $subject
  103. * @param string $flags Regular expression flags
  104. * @throws InvalidArgumentException
  105. * @return string
  106. */
  107. static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
  108. $subject, $flags = ''
  109. ) {
  110. $inputPos = 0;
  111. $outputPos = 0;
  112. $output = '';
  113. $foundStart = false;
  114. $encStart = preg_quote( $startDelim, '!' );
  115. $encEnd = preg_quote( $endDelim, '!' );
  116. $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
  117. $endLength = strlen( $endDelim );
  118. $m = [];
  119. while ( $inputPos < strlen( $subject ) &&
  120. preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
  121. ) {
  122. $tokenOffset = $m[0][1];
  123. if ( $m[1][0] != '' ) {
  124. if ( $foundStart &&
  125. $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
  126. ) {
  127. # An end match is present at the same location
  128. $tokenType = 'end';
  129. $tokenLength = $endLength;
  130. } else {
  131. $tokenType = 'start';
  132. $tokenLength = strlen( $m[0][0] );
  133. }
  134. } elseif ( $m[2][0] != '' ) {
  135. $tokenType = 'end';
  136. $tokenLength = strlen( $m[0][0] );
  137. } else {
  138. throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
  139. }
  140. if ( $tokenType == 'start' ) {
  141. # Only move the start position if we haven't already found a start
  142. # This means that START START END matches outer pair
  143. if ( !$foundStart ) {
  144. # Found start
  145. $inputPos = $tokenOffset + $tokenLength;
  146. # Write out the non-matching section
  147. $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
  148. $outputPos = $tokenOffset;
  149. $contentPos = $inputPos;
  150. $foundStart = true;
  151. } else {
  152. # Move the input position past the *first character* of START,
  153. # to protect against missing END when it overlaps with START
  154. $inputPos = $tokenOffset + 1;
  155. }
  156. } elseif ( $tokenType == 'end' ) {
  157. if ( $foundStart ) {
  158. # Found match
  159. $output .= call_user_func( $callback, [
  160. substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
  161. substr( $subject, $contentPos, $tokenOffset - $contentPos )
  162. ] );
  163. $foundStart = false;
  164. } else {
  165. # Non-matching end, write it out
  166. $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
  167. }
  168. $inputPos = $outputPos = $tokenOffset + $tokenLength;
  169. } else {
  170. throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
  171. }
  172. }
  173. if ( $outputPos < strlen( $subject ) ) {
  174. $output .= substr( $subject, $outputPos );
  175. }
  176. return $output;
  177. }
  178. /**
  179. * Perform an operation equivalent to `preg_replace()` with flags.
  180. *
  181. * Matches this code:
  182. *
  183. * preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
  184. *
  185. * @param string $startDelim Start delimiter regular expression
  186. * @param string $endDelim End delimiter regular expression
  187. * @param string $replace Replacement string. May contain $1, which will be
  188. * replaced by the text between the delimiters
  189. * @param string $subject String to search
  190. * @param string $flags Regular expression flags
  191. * @return string The string with the matches replaced
  192. */
  193. static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
  194. $replacer = new RegexlikeReplacer( $replace );
  195. return self::delimiterReplaceCallback( $startDelim, $endDelim,
  196. $replacer->cb(), $subject, $flags );
  197. }
  198. /**
  199. * More or less "markup-safe" explode()
  200. * Ignores any instances of the separator inside `<...>`
  201. * @param string $separator
  202. * @param string $text
  203. * @return array
  204. */
  205. static function explodeMarkup( $separator, $text ) {
  206. $placeholder = "\x00";
  207. // Remove placeholder instances
  208. $text = str_replace( $placeholder, '', $text );
  209. // Replace instances of the separator inside HTML-like tags with the placeholder
  210. $replacer = new DoubleReplacer( $separator, $placeholder );
  211. $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
  212. // Explode, then put the replaced separators back in
  213. $items = explode( $separator, $cleaned );
  214. foreach ( $items as $i => $str ) {
  215. $items[$i] = str_replace( $placeholder, $separator, $str );
  216. }
  217. return $items;
  218. }
  219. /**
  220. * More or less "markup-safe" str_replace()
  221. * Ignores any instances of the separator inside `<...>`
  222. * @param string $search
  223. * @param string $replace
  224. * @param string $text
  225. * @return string
  226. */
  227. static function replaceMarkup( $search, $replace, $text ) {
  228. $placeholder = "\x00";
  229. // Remove placeholder instances
  230. $text = str_replace( $placeholder, '', $text );
  231. // Replace instances of the separator inside HTML-like tags with the placeholder
  232. $replacer = new DoubleReplacer( $search, $placeholder );
  233. $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
  234. // Explode, then put the replaced separators back in
  235. $cleaned = str_replace( $search, $replace, $cleaned );
  236. $text = str_replace( $placeholder, $search, $cleaned );
  237. return $text;
  238. }
  239. /**
  240. * Escape a string to make it suitable for inclusion in a preg_replace()
  241. * replacement parameter.
  242. *
  243. * @param string $string
  244. * @return string
  245. */
  246. static function escapeRegexReplacement( $string ) {
  247. $string = str_replace( '\\', '\\\\', $string );
  248. $string = str_replace( '$', '\\$', $string );
  249. return $string;
  250. }
  251. /**
  252. * Workalike for explode() with limited memory usage.
  253. *
  254. * @param string $separator
  255. * @param string $subject
  256. * @return ArrayIterator|ExplodeIterator
  257. */
  258. static function explode( $separator, $subject ) {
  259. if ( substr_count( $subject, $separator ) > 1000 ) {
  260. return new ExplodeIterator( $separator, $subject );
  261. } else {
  262. return new ArrayIterator( explode( $separator, $subject ) );
  263. }
  264. }
  265. }