/classes/multibyte.php

https://github.com/rynodivino/system · PHP · 820 lines · 374 code · 146 blank · 300 comment · 98 complexity · 83b3db99b81c628855d88707642c69cc MD5 · raw file

  1. <?php
  2. /*
  3. * @package Habari
  4. *
  5. */
  6. /*
  7. * Habari MultiByte Class
  8. *
  9. * Provides multibyte character set services,
  10. * a necessity since all of Habari's internal string
  11. * manipulations are done in UTF-8. Currently
  12. * this class is a wrapper around mbstring functions.
  13. *
  14. */
  /**
   * Static facade for multibyte-aware string handling.
   *
   * Every public method either delegates to the mbstring extension (the
   * default) or, when the library has been disabled through
   * MultiByte::library( false ), to PHP's byte-oriented native string
   * functions. The native fallbacks are NOT multibyte safe.
   */
  class MultiByte
  {
      /** Library selector: delegate to the mbstring extension. */
      const USE_MBSTRING = 1;

      /**
       * @var string Encoding used by every method that is not handed an
       * explicit $use_enc argument.
       */
      public static $hab_enc = 'UTF-8';

      /**
       * @var mixed The library in use: self::USE_MBSTRING, or false when the
       * native byte-oriented functions are used instead.
       */
      private static $use_library = self::USE_MBSTRING;

      /**
       * Private, empty constructor: the class only exposes static methods and
       * is not meant to be instantiated.
       */
      private function __construct()
      {
      }

      /**
       * Picks the encoding a method should work with.
       *
       * @param string|null $use_enc Encoding requested by the caller, or null.
       * @return string $use_enc when given, otherwise the class-wide encoding.
       */
      private static function resolve_encoding( $use_enc )
      {
          return ( $use_enc !== null ) ? $use_enc : self::$hab_enc;
      }

      /**
       * Gets or sets the class-wide encoding.
       *
       * @param string|null $use_enc The encoding to switch to, or null to only read it.
       * @return string The encoding that was active when the method was
       * called (i.e. the current one for a read, the previous one for a write).
       */
      public static function hab_encoding( $use_enc = null )
      {
          $active = self::$hab_enc;
          if ( $use_enc !== null ) {
              self::$hab_enc = $use_enc;
          }
          return $active;
      }

      /**
       * Gets or sets the multibyte library used internally.
       *
       * @param mixed $new_library self::USE_MBSTRING to use mbstring, false to
       * fall back to the native (byte-oriented) PHP functions, or null to only
       * read the current setting.
       * @return mixed The current library when $new_library is null, the
       * previously active library when $new_library is valid, false for any
       * other value. Note that false is also what a valid switch returns when
       * the library was previously disabled.
       */
      public static function library( $new_library = null )
      {
          if ( $new_library === null ) {
              return self::$use_library;
          }
          // strict comparison on purpose: only the exact constant or exactly false are accepted
          if ( $new_library !== self::USE_MBSTRING && $new_library !== false ) {
              return false;
          }
          $previous = self::$use_library;
          self::$use_library = $new_library;
          return $previous;
      }

      /**
       * Converts a string from one encoding to another.
       *
       * @param string $str The string to convert.
       * @param string|null $use_enc Target encoding; the class-wide encoding when null.
       * @param string|null $from_enc Source encoding; detected with
       * detect_encoding() when not given.
       * @return mixed The converted string, or false when mbstring is disabled
       * or the source encoding could not be detected.
       */
      public static function convert_encoding( $str, $use_enc = null, $from_enc = null )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              return false;
          }
          // loose comparison kept on purpose: '' and false also mean "not given"
          if ( $from_enc == null ) {
              $from_enc = self::detect_encoding( $str );
              if ( $from_enc === false ) {
                  // nothing sensible to hand to mb_convert_encoding()
                  return false;
              }
          }
          return mb_convert_encoding( $str, self::resolve_encoding( $use_enc ), $from_enc );
      }

      /**
       * Guesses the encoding of a string.
       *
       * The candidate list is passed straight to mb_detect_encoding() so the
       * process-wide detection order is never modified. The guess may be
       * wrong, but it is better than assuming an encoding blindly.
       *
       * @param string $str The string to inspect.
       * @return mixed The detected encoding name, or false when nothing
       * matched or mbstring is disabled.
       */
      public static function detect_encoding( $str )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              return false;
          }
          // ISO-8859-1 must be part of the candidates
          $candidates = array( 'ASCII', 'JIS', 'UTF-8', 'ISO-8859-1', 'EUC-JP', 'SJIS' );
          return mb_detect_encoding( $str, $candidates );
      }

      /**
       * Detects an encoding from a leading byte order mark.
       *
       * Works on raw bytes, so it deliberately uses the native substr() and
       * does not depend on the selected library.
       *
       * @param string $str The string to inspect.
       * @return mixed 'UTF-16BE', 'UTF-16LE' or 'UTF-8' when the matching BOM
       * starts the string, false otherwise.
       */
      public static function detect_bom_encoding( $str )
      {
          $str = (string) $str;
          $two_bytes = substr( $str, 0, 2 );
          if ( $two_bytes === "\xFE\xFF" ) {
              return 'UTF-16BE';
          }
          if ( $two_bytes === "\xFF\xFE" ) {
              return 'UTF-16LE';
          }
          if ( substr( $str, 0, 3 ) === "\xEF\xBB\xBF" ) {
              return 'UTF-8';
          }
          return false;
      }

      /**
       * Returns a section of a string.
       *
       * @param string $str The original string.
       * @param integer $begin Index of the first character to return.
       * @param integer|null $len Number of characters to return; when null
       * everything from $begin to the end of the string is returned.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return mixed The requested section (measured in bytes when mbstring
       * is disabled), or whatever failure value the underlying function yields.
       */
      public static function substr( $str, $begin, $len = null, $use_enc = null )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              // a null length is not "to the end" for the native substr() before PHP 8
              return ( $len === null ) ? substr( $str, $begin ) : substr( $str, $begin, $len );
          }
          $enc = self::resolve_encoding( $use_enc );
          if ( $len === null ) {
              // the full character count always reaches the end, whatever $begin is;
              // it must be measured in the same encoding the cut is made in
              $len = self::strlen( $str, $enc );
          }
          return mb_substr( $str, $begin, $len, $enc );
      }

      /**
       * Returns the length of a string.
       *
       * @param string $str The string to measure.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return integer The length in characters, or in bytes when mbstring is disabled.
       */
      public static function strlen( $str, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_strlen( $str, self::resolve_encoding( $use_enc ) );
          }
          return strlen( $str );
      }

      /**
       * Finds the position of the first occurrence of a string in another.
       *
       * @param string $haysack The string being searched.
       * @param string $needle The string to look for.
       * @param integer $offset Position at which the search starts. Defaults to 0.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return mixed The position counted from the start of $haysack (in
       * characters, or bytes when mbstring is disabled), or false when not found.
       */
      public static function strpos( $haysack, $needle, $offset = 0, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_strpos( $haysack, $needle, $offset, self::resolve_encoding( $use_enc ) );
          }
          return strpos( $haysack, $needle, $offset );
      }

      /**
       * Case-insensitive variant of strpos().
       *
       * @param string $haysack The string being searched.
       * @param string $needle The string to look for.
       * @param integer $offset Position at which the search starts. Defaults to 0.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return mixed The position of the first match counted from the start
       * of $haysack, or false when not found.
       */
      public static function stripos( $haysack, $needle, $offset = 0, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_stripos( $haysack, $needle, $offset, self::resolve_encoding( $use_enc ) );
          }
          return stripos( $haysack, $needle, $offset );
      }

      /**
       * Finds the position of the last occurrence of a string in another.
       *
       * @param string $haysack The string being searched.
       * @param string $needle The string to look for.
       * @param integer $offset Search offset handed to the underlying function. Defaults to 0.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return mixed The position of the last match counted from the start
       * of $haysack, or false when not found.
       */
      public static function strrpos( $haysack, $needle, $offset = 0, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_strrpos( $haysack, $needle, $offset, self::resolve_encoding( $use_enc ) );
          }
          return strrpos( $haysack, $needle, $offset );
      }

      /**
       * Case-insensitive variant of strrpos().
       *
       * @param string $haysack The string being searched.
       * @param string $needle The string to look for.
       * @param integer $offset Search offset handed to the underlying function. Defaults to 0.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return mixed The position of the last match counted from the start
       * of $haysack, or false when not found.
       */
      public static function strripos( $haysack, $needle, $offset = 0, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_strripos( $haysack, $needle, $offset, self::resolve_encoding( $use_enc ) );
          }
          return strripos( $haysack, $needle, $offset );
      }

      /**
       * Lowercases a string. When mbstring is disabled the native
       * strtolower() is used, which can give unexpected results on
       * multibyte text.
       *
       * @param string $str The string to lowercase.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return string The lowercased string.
       */
      public static function strtolower( $str, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_strtolower( $str, self::resolve_encoding( $use_enc ) );
          }
          return strtolower( $str );
      }

      /**
       * Uppercases a string. When mbstring is disabled the native
       * strtoupper() is used, which can give unexpected results on
       * multibyte text.
       *
       * @param string $str The string to uppercase.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return string The uppercased string.
       */
      public static function strtoupper( $str, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_strtoupper( $str, self::resolve_encoding( $use_enc ) );
          }
          return strtoupper( $str );
      }

      /**
       * Determines whether a string is valid character data (according to mbstring).
       *
       * @param string $str The string to check.
       * @param string|null $use_enc Encoding to check against; the class-wide encoding when null.
       * @return bool The mbstring verdict; always true when mbstring is
       * disabled, because no check can be made.
       */
      public static function valid_data( $str, $use_enc = null )
      {
          if ( self::$use_library === self::USE_MBSTRING ) {
              return mb_check_encoding( $str, self::resolve_encoding( $use_enc ) );
          }
          return true;
      }

      /**
       * Makes a string's first character uppercase.
       *
       * @see http://php.net/ucfirst
       * @param string $str The string to capitalize.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return string The capitalized string.
       */
      public static function ucfirst ( $str, $use_enc = null )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              return ucfirst( $str );
          }
          $enc = self::resolve_encoding( $use_enc );
          $head = self::strtoupper( self::substr( $str, 0, 1, $enc ), $enc );
          $tail = self::substr( $str, 1, null, $enc );
          return $head . $tail;
      }

      /**
       * Makes a string's first character lowercase.
       *
       * @see http://php.net/lcfirst
       * @param string $str The string to lowercase.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return string The string with its first character lowercased.
       */
      public static function lcfirst ( $str, $use_enc = null )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              // lcfirst() is php 5.3+ so it is emulated with byte functions
              return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
          }
          $enc = self::resolve_encoding( $use_enc );
          $head = self::strtolower( self::substr( $str, 0, 1, $enc ), $enc );
          $tail = self::substr( $str, 1, null, $enc );
          return $head . $tail;
      }

      /**
       * Replaces all occurrences of the search string with the replacement string.
       *
       * @see http://php.net/str_replace
       * @param mixed $search A string or an array of strings to search for.
       * @param mixed $replace A string or an array of strings to replace search values with.
       * @param mixed $subject The string, or array of strings, to work on.
       * @param int $count Set to the number of replacements performed.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return mixed The subject with replaced values.
       */
      public static function str_replace ( $search, $replace, $subject, &$count = 0, $use_enc = null )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              return str_replace( $search, $replace, $subject, $count );
          }
          // start from zero, exactly like the native function does
          $count = 0;
          return self::replace_occurrences( $search, $replace, $subject, $count, self::resolve_encoding( $use_enc ), false );
      }

      /**
       * Case-insensitive variant of str_replace().
       *
       * @see http://php.net/str_ireplace
       * @param mixed $search A string or an array of strings to search for.
       * @param mixed $replace A string or an array of strings to replace search values with.
       * @param mixed $subject The string, or array of strings, to work on.
       * @param int $count Set to the number of replacements performed.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return mixed The subject with replaced values.
       */
      public static function str_ireplace( $search, $replace, $subject, &$count = 0, $use_enc = null )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              return str_ireplace( $search, $replace, $subject, $count );
          }
          // start from zero, exactly like the native function does
          $count = 0;
          return self::replace_occurrences( $search, $replace, $subject, $count, self::resolve_encoding( $use_enc ), true );
      }

      /**
       * Shared multibyte search-and-replace engine behind str_replace() and
       * str_ireplace().
       *
       * @param mixed $search A string or an array of strings to search for.
       * @param mixed $replace A string or an array of strings to replace search values with.
       * @param mixed $subject The string, or array of strings, to work on.
       * @param int $count Incremented once per replacement made.
       * @param string $enc The encoding to use.
       * @param bool $ignore_case Whether matching is case-insensitive.
       * @return mixed The subject with replaced values.
       */
      private static function replace_occurrences( $search, $replace, $subject, &$count, $enc, $ignore_case )
      {
          // bring search and replace into two parallel, zero-indexed lists
          if ( is_array( $search ) ) {
              $needles = array_values( $search );
              if ( is_array( $replace ) ) {
                  $replacements = array_values( $replace );
              }
              else if ( count( $needles ) > 0 ) {
                  // a single replacement applies to every search value
                  $replacements = array_fill( 0, count( $needles ), $replace );
              }
              else {
                  $replacements = array();
              }
          }
          else {
              $needles = array( $search );
              $replacements = is_array( $replace ) ? array_values( $replace ) : array( $replace );
          }

          // an array subject is handled element by element
          if ( is_array( $subject ) ) {
              foreach ( $subject as $key => $value ) {
                  $subject[ $key ] = self::replace_occurrences( $needles, $replacements, $value, $count, $enc, $ignore_case );
              }
              return $subject;
          }

          foreach ( $needles as $index => $needle ) {
              $needle = (string) $needle;
              $needle_len = self::strlen( $needle, $enc );
              if ( $needle_len < 1 ) {
                  // an empty needle matches everywhere and would never let the scan advance
                  continue;
              }
              // search values without a counterpart are replaced with an empty string
              $replacement = isset( $replacements[ $index ] ) ? $replacements[ $index ] : '';

              // collect every match first, so text introduced by the replacement is never rescanned
              $positions = array();
              $offset = 0;
              while ( true ) {
                  $pos = $ignore_case
                      ? self::stripos( $subject, $needle, $offset, $enc )
                      : self::strpos( $subject, $needle, $offset, $enc );
                  if ( $pos === false ) {
                      break;
                  }
                  $positions[] = $pos;
                  // continue right after this match
                  $offset = $pos + $needle_len;
              }
              if ( count( $positions ) === 0 ) {
                  continue;
              }

              // rebuild the subject left to right: untouched text, replacement, untouched text, ...
              $rebuilt = '';
              $cursor = 0;
              foreach ( $positions as $pos ) {
                  $rebuilt .= self::substr( $subject, $cursor, $pos - $cursor, $enc ) . $replacement;
                  $cursor = $pos + $needle_len;
                  $count++;
              }
              $subject = $rebuilt . self::substr( $subject, $cursor, null, $enc );
          }
          return $subject;
      }

      /**
       * Uppercases the first character of each word in a string.
       *
       * From php.net/ucwords:
       * The definition of a word is any string of characters that is immediately after a whitespace
       * (These are: space, form-feed, newline, carriage return, horizontal tab, and vertical tab).
       *
       * @see http://php.net/ucwords
       * @param string $str The input string.
       * @param string|null $use_enc Encoding to use; the class-wide encoding when null.
       * @return string The modified string.
       */
      public static function ucwords ( $str, $use_enc = null )
      {
          if ( self::$use_library !== self::USE_MBSTRING ) {
              return ucwords( $str );
          }
          $enc = self::resolve_encoding( $use_enc );
          $delimiters = array(
              "\x20", // space
              "\x0C", // form-feed
              "\x0A", // newline
              "\x0D", // carriage return
              "\x09", // horizontal tab
              "\x0B", // vertical tab
          );
          // split on one delimiter at a time, capitalize every piece, then glue the string back together
          foreach ( $delimiters as $delimiter ) {
              $words = explode( $delimiter, $str );
              foreach ( $words as $index => $word ) {
                  $words[ $index ] = self::ucfirst( $word, $enc );
              }
              $str = implode( $delimiter, $words );
          }
          return $str;
      }
  }
  674. ?>