PageRenderTime 41ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/classes/core/String.inc.php

https://github.com/michaeljoyce/pkp-lib
PHP | 807 lines | 515 code | 70 blank | 222 comment | 71 complexity | 8ca94e4b82f6401d49e4d8e7b1cc2082 MD5 | raw file
Possible License(s): LGPL-2.1, BSD-3-Clause
  1. <?php
  2. /**
  3. * @file classes/core/String.inc.php
  4. *
  5. * Copyright (c) 2000-2010 John Willinsky
  6. * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
  7. *
  8. * @class String
  9. * @ingroup core
  10. *
  11. * @brief String manipulation wrapper class.
  12. *
  13. */
  14. // $Id$
  15. /*
  16. * Perl-compatible regular expression (PCRE) constants:
  17. * These are defined application-wide for consistency
  18. */
  19. /*
  20. * RFC-2396 URIs
  21. *
  22. * Thanks to the PEAR Validation package (Tomas V.V.Cox <cox@idecnet.com>,
  23. * Pierre-Alain Joye <pajoye@php.net>, Amir Mohammad Saied <amir@php.net>)
  24. *
  25. * Originally published under the "New BSD License"
  26. * http://www.opensource.org/licenses/bsd-license.php
  27. */
  28. define('PCRE_URI', '(?:([a-z][-+.a-z0-9]*):)?' . // Scheme
  29. '(?://' .
  30. '(?:((?:%[0-9a-f]{2}|[-a-z0-9_.!~*\'();:\&=+$,])*)@)?' . // User
  31. '(?:((?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)*[a-z](?:[a-z0-9]+)?\.?)' . // Hostname
  32. '|([0-9]{1,3}(?:\.[0-9]{1,3}){3}))' . // IP Address
  33. '(?::([0-9]*))?)' . // Port
  34. '((?:/(?:%[0-9a-f]{2}|[-a-z0-9_.!~*\'():@\&=+$,;])*)*/?)?' . // Path
  35. '(?:\?([^#]*))?' . // Query String
  36. '(?:\#((?:%[0-9a-f]{2}|[-a-z0-9_.!~*\'();/?:@\&=+$,])*))?'); // Fragment
  37. // RFC-2822 email addresses
  38. define('PCRE_EMAIL_ADDRESS',
  39. '[-a-z0-9!#\$%&\'\*\+\/=\?\^_\`\{\|\}~]' . '+' . // One or more atom characters.
  40. '(\.' . '[-a-z0-9!#\$%&\'\*\+\/=\?\^_\`\{\|\}~]' . '+)*'. // Followed by zero or more dot separated sets of one or more atom characters.
  41. '@'. // Followed by an "at" character.
  42. '(' . '([a-z0-9]([-a-z0-9]*[a-z0-9]+)?)' . '{1,63}\.)+'. // Followed by one or max 63 domain characters (dot separated).
  43. '([a-z0-9]([-a-z0-9]*[a-z0-9]+)?)' . '{2,63}' // Must be followed by one set consisting a period of two or max 63 domain characters.
  44. );
  45. class String {
  46. /**
  47. * Perform initialization required for the string wrapper library.
  48. */
  49. function init() {
  50. $clientCharset = strtolower(Config::getVar('i18n', 'client_charset'));
  51. // Check if mbstring is installed (requires PHP >= 4.3.0)
  52. if (String::hasMBString()) {
  53. // mbstring routines are available
  54. define('ENABLE_MBSTRING', true);
  55. // Set up required ini settings for mbstring
  56. // FIXME Do any other mbstring settings need to be set?
  57. mb_internal_encoding($clientCharset);
  58. mb_substitute_character('63'); // question mark
  59. }
  60. // Define modifier to be used in regexp_* routines
  61. // FIXME Should non-UTF-8 encodings be supported with mbstring?
  62. if ($clientCharset == 'utf-8' && String::hasPCREUTF8()) {
  63. define('PCRE_UTF8', 'u');
  64. } else {
  65. define('PCRE_UTF8', '');
  66. }
  67. }
  68. /**
  69. * Check if server has the mbstring library.
  70. * Currently requires PHP >= 4.3.0 (for mb_strtolower, mb_strtoupper,
  71. * and mb_substr_count)
  72. * @return boolean
  73. */
  74. function hasMBString() {
  75. static $hasMBString;
  76. if (isset($hasMBString)) return $hasMBString;
  77. // If string overloading is active, it will break many of the
  78. // native implementations. mbstring.func_overload must be set
  79. // to 0, 1 or 4 in php.ini (string overloading disabled).
  80. if (ini_get('mbstring.func_overload') && defined('MB_OVERLOAD_STRING')) {
  81. $hasMBString = false;
  82. } else {
  83. $hasMBString = (
  84. extension_loaded('mbstring') &&
  85. function_exists('mb_strlen') &&
  86. function_exists('mb_strpos') &&
  87. function_exists('mb_strrpos') &&
  88. function_exists('mb_substr') &&
  89. function_exists('mb_strtolower') &&
  90. function_exists('mb_strtoupper') &&
  91. function_exists('mb_substr_count') &&
  92. function_exists('mb_send_mail')
  93. );
  94. }
  95. return $hasMBString;
  96. }
  97. /**
  98. * Check if server supports the PCRE_UTF8 modifier.
  99. * @return boolean
  100. */
  101. function hasPCREUTF8() {
  102. // The PCRE_UTF8 modifier is only supported on PHP >= 4.1.0 (*nix) or PHP >= 4.2.3 (win32)
  103. // Evil check to see if PCRE_UTF8 is supported
  104. if (@preg_match('//u', '')) {
  105. return true;
  106. } else {
  107. return false;
  108. }
  109. }
  110. //
  111. // Wrappers for basic string manipulation routines.
  112. // See the phputf8 documentation for usage.
  113. //
  114. /**
  115. * @see http://ca.php.net/manual/en/function.strlen.php
  116. */
  117. function strlen($string) {
  118. if (defined('ENABLE_MBSTRING')) {
  119. require_once 'mbstring/core.php';
  120. } else {
  121. require_once 'utils/unicode.php';
  122. require_once 'native/core.php';
  123. }
  124. return utf8_strlen($string);
  125. }
  126. /**
  127. * @see http://ca.php.net/manual/en/function.strpos.php
  128. */
  129. function strpos($haystack, $needle, $offset = 0) {
  130. if (defined('ENABLE_MBSTRING')) {
  131. require_once 'mbstring/core.php';
  132. } else {
  133. require_once 'utils/unicode.php';
  134. require_once 'native/core.php';
  135. }
  136. return utf8_strpos($haystack, $needle, $offset);
  137. }
  138. /**
  139. * @see http://ca.php.net/manual/en/function.strrpos.php
  140. */
  141. function strrpos($haystack, $needle) {
  142. if (defined('ENABLE_MBSTRING')) {
  143. require_once 'mbstring/core.php';
  144. } else {
  145. require_once 'utils/unicode.php';
  146. require_once 'native/core.php';
  147. }
  148. return utf8_strrpos($haystack, $needle, $offset);
  149. }
  150. /**
  151. * @see http://ca.php.net/manual/en/function.substr.php
  152. */
  153. function substr($string, $start, $length = false) {
  154. if (defined('ENABLE_MBSTRING')) {
  155. require_once 'mbstring/core.php';
  156. } else {
  157. require_once 'utils/unicode.php';
  158. require_once 'native/core.php';
  159. }
  160. return utf8_substr($string, $start, $length);
  161. }
  162. /**
  163. * @see http://ca.php.net/manual/en/function.strtolower.php
  164. */
  165. function strtolower($string) {
  166. if (defined('ENABLE_MBSTRING')) {
  167. require_once 'mbstring/core.php';
  168. } else {
  169. require_once 'utils/unicode.php';
  170. require_once 'native/core.php';
  171. }
  172. return utf8_strtolower($string);
  173. }
  174. /**
  175. * @see http://ca.php.net/manual/en/function.strtoupper.php
  176. */
  177. function strtoupper($string) {
  178. if (defined('ENABLE_MBSTRING')) {
  179. require_once 'mbstring/core.php';
  180. } else {
  181. require_once 'utils/unicode.php';
  182. require_once 'native/core.php';
  183. }
  184. return utf8_strtoupper($string);
  185. }
  186. /**
  187. * @see http://ca.php.net/manual/en/function.ucfirst.php
  188. */
  189. function ucfirst($string) {
  190. if (defined('ENABLE_MBSTRING')) {
  191. require_once 'mbstring/core.php';
  192. require_once 'ucfirst.php';
  193. } else {
  194. require_once 'utils/unicode.php';
  195. require_once 'native/core.php';
  196. require_once 'ucfirst.php';
  197. }
  198. return utf8_ucfirst($string);
  199. }
  200. /**
  201. * @see http://ca.php.net/manual/en/function.substr_count.php
  202. */
  203. function substr_count($haystack, $needle) {
  204. if (defined('ENABLE_MBSTRING')) {
  205. return mb_substr_count($haystack, $needle); // Requires PHP >= 4.3.0
  206. } else {
  207. return substr_count($haystack, $needle);
  208. }
  209. }
  210. /**
  211. * @see http://ca.php.net/manual/en/function.encode_mime_header.php
  212. */
  213. function encode_mime_header($string) {
  214. if (defined('ENABLE_MBSTRING')) {
  215. return mb_encode_mimeheader($string, mb_internal_encoding(), 'B', MAIL_EOL);
  216. } else {
  217. return $string;
  218. }
  219. }
  220. /**
  221. * @see http://ca.php.net/manual/en/function.mail.php
  222. */
  223. function mail($to, $subject, $message, $additional_headers = '', $additional_parameters = '') {
  224. // Cannot use mb_send_mail as it base64 encodes the whole body of the email,
  225. // making it useless for multipart emails
  226. if (empty($additional_parameters)) {
  227. return mail($to, $subject, $message, $additional_headers);
  228. } else {
  229. return mail($to, $subject, $message, $additional_headers, $additional_parameters);
  230. }
  231. }
  232. //
  233. // Wrappers for PCRE-compatible regular expression routines.
  234. // See the php.net documentation for usage.
  235. //
  236. /**
  237. * @see http://ca.php.net/manual/en/function.regexp_quote.php
  238. */
  239. function regexp_quote($string, $delimiter = '/') {
  240. return preg_quote($string, $delimiter);
  241. }
  242. /**
  243. * @see http://ca.php.net/manual/en/function.regexp_grep.php
  244. */
  245. function regexp_grep($pattern, $input) {
  246. if (PCRE_UTF8 && !String::utf8_compliant($input)) $input = String::utf8_bad_strip($input);
  247. return preg_grep($pattern . PCRE_UTF8, $input);
  248. }
  249. /**
  250. * @see http://ca.php.net/manual/en/function.regexp_match.php
  251. */
  252. function regexp_match($pattern, $subject) {
  253. if (PCRE_UTF8 && !String::utf8_compliant($subject)) $subject = String::utf8_bad_strip($subject);
  254. return preg_match($pattern . PCRE_UTF8, $subject);
  255. }
  256. /**
  257. * @see http://ca.php.net/manual/en/function.regexp_match_get.php
  258. */
  259. function regexp_match_get($pattern, $subject, &$matches) {
  260. // NOTE: This function was created since PHP < 5.x does not support optional reference parameters
  261. if (PCRE_UTF8 && !String::utf8_compliant($subject)) $subject = String::utf8_bad_strip($subject);
  262. return preg_match($pattern . PCRE_UTF8, $subject, $matches);
  263. }
  264. /**
  265. * @see http://ca.php.net/manual/en/function.regexp_match_all.php
  266. */
  267. function regexp_match_all($pattern, $subject, &$matches) {
  268. if (PCRE_UTF8 && !String::utf8_compliant($subject)) $subject = String::utf8_bad_strip($subject);
  269. return preg_match_all($pattern . PCRE_UTF8, $subject, $matches);
  270. }
  271. /**
  272. * @see http://ca.php.net/manual/en/function.regexp_replace.php
  273. */
  274. function regexp_replace($pattern, $replacement, $subject, $limit = -1) {
  275. if (PCRE_UTF8 && !String::utf8_compliant($subject)) $subject = String::utf8_bad_strip($subject);
  276. return preg_replace($pattern . PCRE_UTF8, $replacement, $subject, $limit);
  277. }
  278. /**
  279. * @see http://ca.php.net/manual/en/function.regexp_replace_callback.php
  280. */
  281. function regexp_replace_callback($pattern, $callback, $subject, $limit = -1) {
  282. if (PCRE_UTF8 && !String::utf8_compliant($subject)) $subject = String::utf8_bad_strip($subject);
  283. return preg_replace_callback($pattern . PCRE_UTF8, $callback, $subject, $limit);
  284. }
  285. /**
  286. * @see http://ca.php.net/manual/en/function.regexp_split.php
  287. */
  288. function regexp_split($pattern, $subject, $limit = -1) {
  289. if (PCRE_UTF8 && !String::utf8_compliant($subject)) $subject = String::utf8_bad_strip($subject);
  290. return preg_split($pattern . PCRE_UTF8, $subject, $limit);
  291. }
  292. /**
  293. * @see http://ca.php.net/manual/en/function.mime_content_type.php
  294. */
  295. function mime_content_type($filename) {
  296. if (function_exists('mime_content_type')) {
  297. $result = mime_content_type($filename);
  298. // mime_content_type appears to return a charset
  299. // (erroneously?) in recent versions of PHP5
  300. if (($i = strpos($result, ';')) !== false) {
  301. $result = trim(substr($result, 0, $i));
  302. }
  303. return $result;
  304. } elseif (function_exists('finfo_open')) {
  305. $localeFiles =& Registry::get('fileInfo', true, null);
  306. if ($fi === null) {
  307. $fi = finfo_open(FILEINFO_MIME, Config::getVar('finfo', 'mime_database_path'));
  308. }
  309. if ($fi !== false) {
  310. return strtok(finfo_file($fi, $filename), ' ;');
  311. }
  312. }
  313. // Fall back on an external "file" tool
  314. $f = escapeshellarg($filename);
  315. $result = trim(`file --brief --mime $f`);
  316. // Make sure we just return the mime type.
  317. if (($i = strpos($result, ';')) !== false) {
  318. $result = trim(substr($result, 0, $i));
  319. }
  320. return $result;
  321. }
  322. /**
  323. * Strip unsafe HTML from the input text. Covers XSS attacks like scripts,
  324. * onclick(...) attributes, javascript: urls, and special characters.
  325. * @param $input string input string
  326. * @return string
  327. */
  328. function stripUnsafeHtml($input) {
  329. // Parts of this implementation were taken from Horde:
  330. // see http://cvs.horde.org/co.php/framework/MIME/MIME/Viewer/html.php.
  331. $allowedHtml = Config::getVar('security', 'allowed_html');
  332. if ($allowedHtml == '') $allowedHtml = '<a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd> <b> <i> <u> <img> <sup> <sub> <br> <p>';
  333. $html = strip_tags($input, $allowedHtml);
  334. // Change space entities to space characters
  335. $html = preg_replace('/&#(x0*20|0*32);?/i', ' ', $html);
  336. // Remove non-printable characters
  337. $html = preg_replace('/&#x?0*([9A-D]|1[0-3]);/i', '&nbsp;', $html);
  338. $html = preg_replace('/&#x?0*[9A-D]([^0-9A-F]|$)/i', '&nbsp\\1', $html);
  339. $html = preg_replace('/&#0*(9|1[0-3])([^0-9]|$)/i', '&nbsp\\2', $html);
  340. // Remove overly long numeric entities
  341. $html = preg_replace('/&#x?0*[0-9A-F]{6,};?/i', '&nbsp;', $html);
  342. /* Get all attribute="javascript:foo()" tags. This is
  343. * essentially the regex /(=|url\()("?)[^>]* script:/ but
  344. * expanded to catch camouflage with spaces and entities. */
  345. $preg = '/((&#0*61;?|&#x0*3D;?|=)|'
  346. . '((u|&#0*85;?|&#x0*55;?|&#0*117;?|&#x0*75;?)\s*'
  347. . '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*'
  348. . '(l|&#0*76;?|&#x0*4c;?|&#0*108;?|&#x0*6c;?)\s*'
  349. . '(\()))\s*'
  350. . '(&#0*34;?|&#x0*22;?|"|&#0*39;?|&#x0*27;?|\')?'
  351. . '[^>]*\s*'
  352. . '(s|&#0*83;?|&#x0*53;?|&#0*115;?|&#x0*73;?)\s*'
  353. . '(c|&#0*67;?|&#x0*43;?|&#0*99;?|&#x0*63;?)\s*'
  354. . '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*'
  355. . '(i|&#0*73;?|&#x0*49;?|&#0*105;?|&#x0*69;?)\s*'
  356. . '(p|&#0*80;?|&#x0*50;?|&#0*112;?|&#x0*70;?)\s*'
  357. . '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?)\s*'
  358. . '(:|&#0*58;?|&#x0*3a;?)/i';
  359. $html = preg_replace($preg, '\1\8PKPCleaned', $html);
  360. /* Get all on<foo>="bar()". NEVER allow these. */
  361. $html = preg_replace('/([\s"\']+'
  362. . '(o|&#0*79;?|&#0*4f;?|&#0*111;?|&#0*6f;?)'
  363. . '(n|&#0*78;?|&#0*4e;?|&#0*110;?|&#0*6e;?)'
  364. . '\w+)\s*=/i', '\1PKPCleaned=', $html);
  365. $pattern = array(
  366. '|<([^>]*)&{.*}([^>]*)>|',
  367. '|<([^>]*)mocha:([^>]*)>|i',
  368. '|<([^>]*)binding:([^>]*)>|i'
  369. );
  370. $replace = array('<&{;}\3>', '<\1PKPCleaned:\2>', '<\1PKPCleaned:\2>');
  371. $html = preg_replace($pattern, $replace, $html);
  372. return $html;
  373. }
  374. //
  375. // Wrappers for UTF-8 validation routines
  376. // See the phputf8 documentation for usage.
  377. //
  378. /**
  379. * Detect whether a string contains non-ascii multibyte sequences in the UTF-8 range
  380. * @param $str string input string
  381. * @return boolean
  382. */
  383. function utf8_is_valid($str) {
  384. require_once 'utils/validation.php';
  385. return utf8_is_valid($str);
  386. }
  387. /**
  388. * Tests whether a string complies as UTF-8; faster and less strict than utf8_is_valid
  389. * see lib/phputf8/utils/validation.php for more details
  390. * @param $str string input string
  391. * @return boolean
  392. */
  393. function utf8_compliant($str) {
  394. require_once 'utils/validation.php';
  395. return utf8_compliant($str);
  396. }
  397. /**
  398. * Locates the first bad byte in a UTF-8 string returning it's byte index in the string
  399. * @param $str string input string
  400. * @return string
  401. */
  402. function utf8_bad_find($str) {
  403. require_once 'utils/bad.php';
  404. return utf8_bad_find($str);
  405. }
  406. /**
  407. * Strips out any bad bytes from a UTF-8 string and returns the rest
  408. * @param $str string input string
  409. * @return string
  410. */
  411. function utf8_bad_strip($str) {
  412. require_once 'utils/bad.php';
  413. return utf8_bad_strip($str);
  414. }
  415. /**
  416. * Replace bad bytes with an alternative character - ASCII character
  417. * @param $str string input string
  418. * @param $replace string optional
  419. * @return string
  420. */
  421. function utf8_bad_replace($str, $replace = '?') {
  422. require_once 'utils/bad.php';
  423. return utf8_bad_replace($str, $replace);
  424. }
  425. /**
  426. * Replace bad bytes with an alternative character - ASCII character
  427. * @param $str string input string
  428. * @return string
  429. */
  430. function utf8_strip_ascii_ctrl($str) {
  431. require_once 'utils/ascii.php';
  432. return utf8_strip_ascii_ctrl($str);
  433. }
  434. /**
  435. * Normalize a string in an unknown (non-UTF8) encoding into a valid UTF-8 sequence
  436. * @param $str string input string
  437. * @return string
  438. */
  439. function utf8_normalize($str) {
  440. import('core.Transcoder');
  441. if (String::hasMBString()) {
  442. // NB: CP-1252 often segfaults; we've left it out here but it will detect as 'ISO-8859-1'
  443. $mb_encoding_order = 'UTF-8, UTF-7, ASCII, ISO-8859-1, EUC-JP, SJIS, eucJP-win, SJIS-win, JIS, ISO-2022-JP';
  444. if (checkPhpVersion('4.3.8')) {
  445. $detected_encoding = mb_detect_encoding($str, $mb_encoding_order, FALSE);
  446. } else {
  447. $detected_encoding = mb_detect_encoding($str, $mb_encoding_order);
  448. }
  449. } elseif (function_exists('iconv') && strlen(iconv('CP1252', 'UTF-8', $str)) != strlen(iconv('ISO-8859-1', 'UTF-8', $str))) {
  450. // use iconv to detect CP-1252, assuming default ISO-8859-1
  451. $detected_encoding = 'CP1252';
  452. } else {
  453. // assume ISO-8859-1, PHP default
  454. $detected_encoding = 'ISO-8859-1';
  455. }
  456. // transcode CP-1252/ISO-8859-1 into HTML entities; this works because CP-1252 is mapped onto ISO-8859-1
  457. if ('ISO-8859-1' == $detected_encoding || 'CP1252' == $detected_encoding) {
  458. $trans = new Transcoder('CP1252', 'HTML-ENTITIES');
  459. $str = $trans->trans($str);
  460. }
  461. // transcode from detected encoding to to UTF-8
  462. $trans = new Transcoder($detected_encoding, 'UTF-8');
  463. $str = $trans->trans($str);
  464. return $str;
  465. }
  466. /**
  467. * US-ASCII transliterations of Unicode text
  468. * @param $str string input string
  469. * @return string
  470. */
  471. function utf8_to_ascii($str) {
  472. require_once('utf8_to_ascii.php');
  473. return utf8_to_ascii($str);
  474. }
  475. /**
  476. * Returns the UTF-8 string corresponding to the unicode value
  477. * Does not require any multibyte PHP libraries
  478. * (from php.net, courtesy - romans@void.lv)
  479. * @param $num int
  480. * @return string
  481. */
  482. function code2utf ($num) {
  483. if ($num < 128) return chr($num);
  484. if ($num < 2048) return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
  485. if ($num < 65536) return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  486. if ($num < 2097152) return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  487. return '';
  488. }
  489. /**
  490. * Convert UTF-8 encoded characters in a string to escaped HTML entities
  491. * This is a helper function for transcoding into HTML or XML for output
  492. * @param $str string input string
  493. * @return string
  494. */
  495. function utf2html ($str) {
  496. $ret = "";
  497. $max = strlen($str);
  498. $last = 0; // keeps the index of the last regular character
  499. for ($i=0; $i<$max; $i++) {
  500. $c = $str{$i};
  501. $c1 = ord($c);
  502. if ($c1>>5 == 6) { // 110x xxxx, 110 prefix for 2 bytes unicode
  503. $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
  504. $c1 &= 31; // remove the 3 bit two bytes prefix
  505. $c2 = ord($str{++$i}); // the next byte
  506. $c2 &= 63; // remove the 2 bit trailing byte prefix
  507. $c2 |= (($c1 & 3) << 6); // last 2 bits of c1 become first 2 of c2
  508. $c1 >>= 2; // c1 shifts 2 to the right
  509. $ret .= "&#" . ($c1 * 0x100 + $c2) . ";"; // this is the fastest string concatenation
  510. $last = $i+1;
  511. }
  512. elseif ($c1>>4 == 14) { // 1110 xxxx, 110 prefix for 3 bytes unicode
  513. $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
  514. $c2 = ord($str{++$i}); // the next byte
  515. $c3 = ord($str{++$i}); // the third byte
  516. $c1 &= 15; // remove the 4 bit three bytes prefix
  517. $c2 &= 63; // remove the 2 bit trailing byte prefix
  518. $c3 &= 63; // remove the 2 bit trailing byte prefix
  519. $c3 |= (($c2 & 3) << 6); // last 2 bits of c2 become first 2 of c3
  520. $c2 >>=2; //c2 shifts 2 to the right
  521. $c2 |= (($c1 & 15) << 4); // last 4 bits of c1 become first 4 of c2
  522. $c1 >>= 4; // c1 shifts 4 to the right
  523. $ret .= '&#' . (($c1 * 0x10000) + ($c2 * 0x100) + $c3) . ';'; // this is the fastest string concatenation
  524. $last = $i+1;
  525. }
  526. }
  527. $str=$ret . substr($str, $last, $i); // append the last batch of regular characters
  528. return $str;
  529. }
  530. /**
  531. * Convert numeric HTML entities in a string to UTF-8 encoded characters
  532. * This is a native alternative to the buggy html_entity_decode() using UTF8
  533. * @param $str string input string
  534. * @return string
  535. */
  536. function html2utf($str) {
  537. // convert named entities to numeric entities
  538. $str = strtr($str, String::getHTMLEntities());
  539. // use PCRE-aware replace function to replace numeric entities
  540. $str = String::regexp_replace('~&#x([0-9a-f]+);~ei', 'String::code2utf(hexdec("\\1"))', $str);
  541. $str = String::regexp_replace('~&#([0-9]+);~e', 'String::code2utf(\\1)', $str);
  542. return $str;
  543. }
  544. /**
  545. * Return an associative array of named->numeric HTML entities
  546. * Required to support HTML functions without objects in PHP4/PHP5
  547. * From php.net: function.get-html-translation-table.php
  548. * @return string
  549. */
  550. function getHTMLEntities () {
  551. // define the conversion table
  552. $html_entities = array(
  553. "&Aacute;" => "&#193;", "&aacute;" => "&#225;", "&Acirc;" => "&#194;",
  554. "&acirc;" => "&#226;", "&acute;" => "&#180;", "&AElig;" => "&#198;",
  555. "&aelig;" => "&#230;", "&Agrave;" => "&#192;", "&agrave;" => "&#224;",
  556. "&alefsym;" => "&#8501;","&Alpha;" => "&#913;", "&alpha;" => "&#945;",
  557. "&amp;" => "&#38;", "&and;" => "&#8743;", "&ang;" => "&#8736;",
  558. "&apos;" => "&#39;", "&Aring;" => "&#197;", "&aring;" => "&#229;",
  559. "&asymp;" => "&#8776;", "&Atilde;" => "&#195;", "&atilde;" => "&#227;",
  560. "&Auml;" => "&#196;", "&auml;" => "&#228;", "&bdquo;" => "&#8222;",
  561. "&Beta;" => "&#914;", "&beta;" => "&#946;", "&brvbar;" => "&#166;",
  562. "&bull;" => "&#8226;", "&cap;" => "&#8745;", "&Ccedil;" => "&#199;",
  563. "&ccedil;" => "&#231;", "&cedil;" => "&#184;", "&cent;" => "&#162;",
  564. "&Chi;" => "&#935;", "&chi;" => "&#967;", "&circ;" => "&#94;",
  565. "&clubs;" => "&#9827;", "&cong;" => "&#8773;", "&copy;" => "&#169;",
  566. "&crarr;" => "&#8629;", "&cup;" => "&#8746;", "&curren;" => "&#164;",
  567. "&dagger;" => "&#8224;","&Dagger;" => "&#8225;", "&darr;" => "&#8595;",
  568. "&dArr;" => "&#8659;", "&deg;" => "&#176;", "&Delta;" => "&#916;",
  569. "&delta;" => "&#948;", "&diams;" => "&#9830;", "&divide;" => "&#247;",
  570. "&Eacute;" => "&#201;", "&eacute;" => "&#233;", "&Ecirc;" => "&#202;",
  571. "&ecirc;" => "&#234;", "&Egrave;" => "&#200;", "&egrave;" => "&#232;",
  572. "&empty;" => "&#8709;", "&emsp;" => "&#8195;", "&ensp;" => "&#8194;",
  573. "&Epsilon;" => "&#917;","&epsilon;" => "&#949;","&equiv;" => "&#8801;",
  574. "&Eta;" => "&#919;", "&eta;" => "&#951;", "&ETH;" => "&#208;",
  575. "&eth;" => "&#240;", "&Euml;" => "&#203;", "&euml;" => "&#235;",
  576. "&euro;" => "&#8364;", "&exist;" => "&#8707;", "&fnof;" => "&#402;",
  577. "&forall;" => "&#8704;","&frac12;" => "&#189;", "&frac14;" => "&#188;",
  578. "&frac34;" => "&#190;", "&frasl;" => "&#8260;", "&Gamma;" => "&#915;",
  579. "&gamma;" => "&#947;", "&ge;" => "&#8805;", "&gt;" => "&#62;",
  580. "&harr;" => "&#8596;", "&hArr;" => "&#8660;", "&hearts;" => "&#9829;",
  581. "&hellip;" => "&#8230;","&Iacute;" => "&#205;", "&iacute;" => "&#237;",
  582. "&Icirc;" => "&#206;", "&icirc;" => "&#238;", "&iexcl;" => "&#161;",
  583. "&Igrave;" => "&#204;", "&igrave;" => "&#236;", "&image;" => "&#8465;",
  584. "&infin;" => "&#8734;", "&int;" => "&#8747;", "&Iota;" => "&#921;",
  585. "&iota;" => "&#953;", "&iquest;" => "&#191;", "&isin;" => "&#8712;",
  586. "&Iuml;" => "&#207;", "&iuml;" => "&#239;", "&Kappa;" => "&#922;",
  587. "&kappa;" => "&#954;", "&Lambda;" => "&#923;", "&lambda;" => "&#955;",
  588. "&lang;" => "&#9001;", "&laquo;" => "&#171;", "&larr;" => "&#8592;",
  589. "&lArr;" => "&#8656;", "&lceil;" => "&#8968;",
  590. "&ldquo;" => "&#8220;", "&le;" => "&#8804;", "&lfloor;" => "&#8970;",
  591. "&lowast;" => "&#8727;","&loz;" => "&#9674;", "&lrm;" => "&#8206;",
  592. "&lsaquo;" => "&#8249;","&lsquo;" => "&#8216;", "&lt;" => "&#60;",
  593. "&macr;" => "&#175;", "&mdash;" => "&#8212;", "&micro;" => "&#181;",
  594. "&middot;" => "&#183;", "&minus;" => "&#45;", "&Mu;" => "&#924;",
  595. "&mu;" => "&#956;", "&nabla;" => "&#8711;", "&nbsp;" => "&#160;",
  596. "&ndash;" => "&#8211;", "&ne;" => "&#8800;", "&ni;" => "&#8715;",
  597. "&not;" => "&#172;", "&notin;" => "&#8713;", "&nsub;" => "&#8836;",
  598. "&Ntilde;" => "&#209;", "&ntilde;" => "&#241;", "&Nu;" => "&#925;",
  599. "&nu;" => "&#957;", "&Oacute;" => "&#211;", "&oacute;" => "&#243;",
  600. "&Ocirc;" => "&#212;", "&ocirc;" => "&#244;", "&OElig;" => "&#338;",
  601. "&oelig;" => "&#339;", "&Ograve;" => "&#210;", "&ograve;" => "&#242;",
  602. "&oline;" => "&#8254;", "&Omega;" => "&#937;", "&omega;" => "&#969;",
  603. "&Omicron;" => "&#927;","&omicron;" => "&#959;","&oplus;" => "&#8853;",
  604. "&or;" => "&#8744;", "&ordf;" => "&#170;", "&ordm;" => "&#186;",
  605. "&Oslash;" => "&#216;", "&oslash;" => "&#248;", "&Otilde;" => "&#213;",
  606. "&otilde;" => "&#245;", "&otimes;" => "&#8855;","&Ouml;" => "&#214;",
  607. "&ouml;" => "&#246;", "&para;" => "&#182;", "&part;" => "&#8706;",
  608. "&permil;" => "&#8240;","&perp;" => "&#8869;", "&Phi;" => "&#934;",
  609. "&phi;" => "&#966;", "&Pi;" => "&#928;", "&pi;" => "&#960;",
  610. "&piv;" => "&#982;", "&plusmn;" => "&#177;", "&pound;" => "&#163;",
  611. "&prime;" => "&#8242;", "&Prime;" => "&#8243;", "&prod;" => "&#8719;",
  612. "&prop;" => "&#8733;", "&Psi;" => "&#936;", "&psi;" => "&#968;",
  613. "&quot;" => "&#34;", "&radic;" => "&#8730;", "&rang;" => "&#9002;",
  614. "&raquo;" => "&#187;", "&rarr;" => "&#8594;", "&rArr;" => "&#8658;",
  615. "&rceil;" => "&#8969;", "&rdquo;" => "&#8221;", "&real;" => "&#8476;",
  616. "&reg;" => "&#174;", "&rfloor;" => "&#8971;","&Rho;" => "&#929;",
  617. "&rho;" => "&#961;", "&rlm;" => "&#8207;", "&rsaquo;" => "&#8250;",
  618. "&rsquo;" => "&#8217;", "&sbquo;" => "&#8218;", "&Scaron;" => "&#352;",
  619. "&scaron;" => "&#353;", "&sdot;" => "&#8901;", "&sect;" => "&#167;",
  620. "&shy;" => "&#173;", "&Sigma;" => "&#931;", "&sigma;" => "&#963;",
  621. "&sigmaf;" => "&#962;", "&sim;" => "&#8764;", "&spades;" => "&#9824;",
  622. "&sub;" => "&#8834;", "&sube;" => "&#8838;", "&sum;" => "&#8721;",
  623. "&sup1;" => "&#185;", "&sup2;" => "&#178;", "&sup3;" => "&#179;",
  624. "&sup;" => "&#8835;", "&supe;" => "&#8839;", "&szlig;" => "&#223;",
  625. "&Tau;" => "&#932;", "&tau;" => "&#964;", "&there4;" => "&#8756;",
  626. "&Theta;" => "&#920;", "&theta;" => "&#952;", "&thetasym;" => "&#977;",
  627. "&thinsp;" => "&#8201;","&THORN;" => "&#222;", "&thorn;" => "&#254;",
  628. "&tilde;" => "&#126;", "&times;" => "&#215;", "&trade;" => "&#8482;",
  629. "&Uacute;" => "&#218;", "&uacute;" => "&#250;", "&uarr;" => "&#8593;",
  630. "&uArr;" => "&#8657;", "&Ucirc;" => "&#219;", "&ucirc;" => "&#251;",
  631. "&Ugrave;" => "&#217;", "&ugrave;" => "&#249;", "&uml;" => "&#168;",
  632. "&upsih;" => "&#978;", "&Upsilon;" => "&#933;","&upsilon;" => "&#965;",
  633. "&Uuml;" => "&#220;", "&uuml;" => "&#252;", "&weierp;" => "&#8472;",
  634. "&Xi;" => "&#926;", "&xi;" => "&#958;", "&Yacute;" => "&#221;",
  635. "&yacute;" => "&#253;", "&yen;" => "&#165;", "&yuml;" => "&#255;",
  636. "&Yuml;" => "&#376;", "&Zeta;" => "&#918;", "&zeta;" => "&#950;",
  637. "&zwj;" => "&#8205;", "&zwnj;" => "&#8204;"
  638. );
  639. return $html_entities;
  640. }
  641. /**
  642. * Wrapper around fputcsv for systems that may or may not support it
  643. * (i.e. PHP before 5.1.0); see PHP documentation for fputcsv.
  644. */
  645. function fputcsv(&$handle, $fields = array(), $delimiter = ',', $enclosure = '"') {
  646. // From PHP website, thanks to boefje at hotmail dot com
  647. if (function_exists('fputcsv')) {
  648. return fputcsv($handle, $fields, $delimiter, $enclosure);
  649. }
  650. $str = '';
  651. $escape_char = '\\';
  652. foreach ($fields as $value) {
  653. if ( strpos($value, $delimiter) !== false ||
  654. strpos($value, $enclosure) !== false ||
  655. strpos($value, "\n") !== false ||
  656. strpos($value, "\r") !== false ||
  657. strpos($value, "\t") !== false ||
  658. strpos($value, ' ') !== false
  659. ) {
  660. $str2 = $enclosure;
  661. $escaped = 0;
  662. $len = strlen($value);
  663. for ($i=0; $i<$len; $i++) {
  664. if ($value[$i] == $escape_char) $escaped = 1;
  665. elseif (!$escaped && $value[$i] == $enclosure) $str2 .= $enclosure;
  666. else $escaped = 0;
  667. $str2 .= $value[$i];
  668. }
  669. $str2 .= $enclosure;
  670. $str .= $str2 . $delimiter;
  671. } else {
  672. $str .= $value . $delimiter;
  673. }
  674. }
  675. $str = substr($str, 0, -1);
  676. $str .= "\n";
  677. return fwrite($handle, $str);
  678. }
  679. /**
  680. * Trim punctuation from a string
  681. * @param $string string input string
  682. * @return string the trimmed string
  683. */
  684. function trimPunctuation($string) {
  685. return trim($string, ' ,.;:!?()[]\\/');
  686. }
  687. /**
  688. * Convert a string to proper title case
  689. * @param $title string
  690. * @return string
  691. */
  692. function titleCase($title) {
  693. $smallWords = array(
  694. 'of', 'a', 'the', 'and', 'an', 'or', 'nor', 'but', 'is', 'if', 'then',
  695. 'else', 'when', 'at', 'from', 'by', 'on', 'off', 'for', 'in', 'out',
  696. 'over', 'to', 'into', 'with'
  697. );
  698. $words = explode(' ', $title);
  699. foreach ($words as $key => $word) {
  700. if ($key == 0 or !in_array(self::strtolower($word), $smallWords)) {
  701. $words[$key] = ucfirst(self::strtolower($word));
  702. } else {
  703. $words[$key] = self::strtolower($word);
  704. }
  705. }
  706. $newTitle = implode(' ', $words);
  707. return $newTitle;
  708. }
  709. /**
  710. * Iterate over an array of delimiters and see whether
  711. * it exists in the given input string. If so, then use
  712. * it to explode the string into an array.
  713. * @param $delimiters array
  714. * @param $input string
  715. * @return array
  716. */
  717. function iterativeExplode($delimiters, $input) {
  718. // Run through the delimiters and try them out
  719. // one by one.
  720. foreach($delimiters as $delimiter) {
  721. if (strstr($input, $delimiter) !== false) {
  722. return explode($delimiter, $input);
  723. }
  724. }
  725. // If none of the delimiters works then return
  726. // the original string as an array.
  727. return (array($input));
  728. }
  729. }
  730. ?>