PageRenderTime 55ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/pkp/classes/core/String.inc.php

https://github.com/lib-uoguelph-ca/ocs
PHP | 937 lines | 619 code | 82 blank | 236 comment | 83 complexity | 8f6d50b982ce52ccbdb969fea681cdc4 MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @file classes/core/String.inc.php
  4. *
  5. * Copyright (c) 2000-2012 John Willinsky
  6. * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
  7. *
  8. * @class String
  9. * @ingroup core
  10. *
  11. * @brief String manipulation wrapper class.
  12. *
  13. */
  14. // $Id$
  15. /*
  16. * Perl-compatible regular expression (PCRE) constants:
  17. * These are defined application-wide for consistency
  18. */
  19. /*
  20. * RFC-2396 URIs
  21. *
  22. * Thanks to the PEAR Validation package (Tomas V.V.Cox <cox@idecnet.com>,
  23. * Pierre-Alain Joye <pajoye@php.net>, Amir Mohammad Saied <amir@php.net>)
  24. *
  25. * Originally published under the "New BSD License"
  26. * http://www.opensource.org/licenses/bsd-license.php
  27. */
  28. define('PCRE_URI', '(?:([a-z][-+.a-z0-9]*):)?' . // Scheme (capture group 1); the fragment carries no delimiters, anchors or modifiers
  29. '(?://' .
  30. '(?:((?:%[0-9a-f]{2}|[-a-z0-9_.!~*\'();:\&=+$,])*)@)?' . // User (capture group 2)
  31. '(?:((?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)*[a-z](?:[a-z0-9]+)?\.?)' . // Hostname (capture group 3)
  32. '|([0-9]{1,3}(?:\.[0-9]{1,3}){3}))' . // IP Address (capture group 4), alternative to the hostname
  33. '(?::([0-9]*))?)' . // Port (capture group 5)
  34. '((?:/(?:%[0-9a-f]{2}|[-a-z0-9_.!~*\'():@\&=+$,;])*)*/?)?' . // Path (capture group 6)
  35. '(?:\?([^#]*))?' . // Query String (capture group 7)
  36. '(?:\#((?:%[0-9a-f]{2}|[-a-z0-9_.!~*\'();/?:@\&=+$,])*))?'); // Fragment (capture group 8)
  37. // RFC-2822 email address pattern fragment (no delimiters or anchors; the character classes list lowercase letters only)
  38. define('PCRE_EMAIL_ADDRESS',
  39. '[-a-z0-9!#\$%&\'\*\+\/=\?\^_\`\{\|\}~]' . '+' . // One or more atom characters.
  40. '(\.' . '[-a-z0-9!#\$%&\'\*\+\/=\?\^_\`\{\|\}~]' . '+)*'. // Followed by zero or more dot separated sets of one or more atom characters.
  41. '@'. // Followed by an "at" character.
  42. '(' . '([a-z0-9]([-a-z0-9]*[a-z0-9]+)?)' . '{1,63}\.)+'. // One or more dot-terminated domain labels. NOTE(review): {1,63} repeats the whole label group; it does not cap a label at 63 characters -- confirm intent.
  43. '([a-z0-9]([-a-z0-9]*[a-z0-9]+)?)' . '{2,63}' // Final label, not followed by a dot. NOTE(review): {2,63} likewise repeats the label group (2 to 63 times) rather than limiting its character count -- confirm intent.
  44. );
  45. // Two camel-case flavours, distinguished by the case of the leading character: one for class names and one for method names
  46. define ('CAMEL_CASE_HEAD_UP', 0x01);
  47. define ('CAMEL_CASE_HEAD_DOWN', 0x02);
  48. define('DEFAULT_ALLOWED_HTML', '<a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd> <b> <i> <u> <img src|alt> <sup> <sub> <br> <p>'); // Passed as the third argument of Config::getVar('security', 'allowed_html', ...) in String::stripUnsafeHtml()
  49. class String {
  /**
   * Perform initialization required for the string wrapper library.
   *
   * Defines the ENABLE_MBSTRING, PCRE_UTF8 and USE_HTML_PURIFIER constants
   * that the other methods of this class test for, and configures mbstring
   * when it is usable. Each constant is only defined if it does not exist
   * yet, so calling init() more than once no longer raises
   * "constant already defined" notices.
   */
  function init() {
      $clientCharset = strtolower(Config::getVar('i18n', 'client_charset'));

      // Check if mbstring is installed (requires PHP >= 4.3.0)
      if (String::hasMBString()) {
          // mbstring routines are available
          if (!defined('ENABLE_MBSTRING')) {
              define('ENABLE_MBSTRING', true);
          }

          // Set up required ini settings for mbstring
          // FIXME Do any other mbstring settings need to be set?
          mb_internal_encoding($clientCharset);
          // The code point must be an integer: PHP 8 rejects a numeric
          // string here (only "none", "long" and "entity" are valid strings).
          mb_substitute_character(63); // question mark
      }

      // Define modifier to be used in regexp_* routines
      // FIXME Should non-UTF-8 encodings be supported with mbstring?
      if (!defined('PCRE_UTF8')) {
          if ($clientCharset == 'utf-8' && String::hasPCREUTF8()) {
              define('PCRE_UTF8', 'u');
          } else {
              define('PCRE_UTF8', '');
          }
      }

      if (checkPhpVersion('5.0.5') && !defined('USE_HTML_PURIFIER')) {
          define('USE_HTML_PURIFIER', 1);
      }
  }
  /**
   * Report whether the mbstring extension can be used by this class.
   *
   * The answer is computed once and cached in a static variable. mbstring
   * counts as unusable when the "mbstring.func_overload" ini setting is
   * non-zero (and MB_OVERLOAD_STRING is defined), because overloaded string
   * functions break the native implementations. Otherwise the extension
   * and every mb_* function listed below must be present (PHP >= 4.3.0
   * for mb_strtolower, mb_strtoupper and mb_substr_count).
   * @return boolean
   */
  function hasMBString() {
      static $hasMBString;

      if (!isset($hasMBString)) {
          $overloadActive = ini_get('mbstring.func_overload') && defined('MB_OVERLOAD_STRING');
          if ($overloadActive) {
              $hasMBString = false;
          } else {
              $requiredFunctions = array(
                  'mb_strlen',
                  'mb_strpos',
                  'mb_strrpos',
                  'mb_substr',
                  'mb_strtolower',
                  'mb_strtoupper',
                  'mb_substr_count',
                  'mb_send_mail'
              );
              $available = extension_loaded('mbstring');
              foreach ($requiredFunctions as $functionName) {
                  if (!$available) break;
                  $available = function_exists($functionName);
              }
              $hasMBString = $available;
          }
      }

      return $hasMBString;
  }
  /**
   * Check if server supports the PCRE_UTF8 ("u") pattern modifier.
   *
   * The modifier is only supported on PHP >= 4.1.0 (*nix) or
   * PHP >= 4.2.3 (win32), so support is probed by running an empty
   * pattern with the modifier and suppressing any resulting warning.
   * @return boolean
   */
  function hasPCREUTF8() {
      $probe = @preg_match('//u', '');
      return $probe ? true : false;
  }
  117. //
  118. // Wrappers for basic string manipulation routines.
  119. // See the phputf8 documentation for usage.
  120. //
  /**
   * UTF-8 aware string length; delegates to utf8_strlen() from the
   * phputf8 library (mbstring-backed or native, depending on whether
   * ENABLE_MBSTRING is defined).
   * @see http://ca.php.net/manual/en/function.strlen.php
   * @param $string string
   * @return int presumably the length in characters; see phputf8 docs
   */
  function strlen($string) {
      if (!defined('ENABLE_MBSTRING')) {
          require_once 'utils/unicode.php';
          require_once 'native/core.php';
      } else {
          require_once 'mbstring/core.php';
      }
      $length = utf8_strlen($string);
      return $length;
  }
  /**
   * UTF-8 aware strpos(); delegates to utf8_strpos() from the phputf8
   * library (mbstring-backed or native, depending on whether
   * ENABLE_MBSTRING is defined).
   * @see http://ca.php.net/manual/en/function.strpos.php
   * @param $haystack string
   * @param $needle string
   * @param $offset int optional start offset, defaults to 0
   * @return mixed whatever utf8_strpos() returns
   */
  function strpos($haystack, $needle, $offset = 0) {
      if (!defined('ENABLE_MBSTRING')) {
          require_once 'utils/unicode.php';
          require_once 'native/core.php';
      } else {
          require_once 'mbstring/core.php';
      }
      $position = utf8_strpos($haystack, $needle, $offset);
      return $position;
  }
  /**
   * UTF-8 aware strrpos(); delegates to utf8_strrpos() from the phputf8
   * library (mbstring-backed or native, depending on whether
   * ENABLE_MBSTRING is defined).
   *
   * The method previously forwarded an undefined $offset variable to
   * utf8_strrpos(). The offset is now an optional parameter and is only
   * forwarded when the caller actually supplies one, so the library's own
   * "no offset" default applies otherwise.
   * @see http://ca.php.net/manual/en/function.strrpos.php
   * @param $haystack string
   * @param $needle string
   * @param $offset int optional start offset; omitted when null
   * @return mixed whatever utf8_strrpos() returns
   */
  function strrpos($haystack, $needle, $offset = null) {
      if (defined('ENABLE_MBSTRING')) {
          require_once 'mbstring/core.php';
      } else {
          require_once 'utils/unicode.php';
          require_once 'native/core.php';
      }

      if (is_null($offset)) {
          return utf8_strrpos($haystack, $needle);
      }
      return utf8_strrpos($haystack, $needle, (int) $offset);
  }
  /**
   * UTF-8 aware substr(); delegates to utf8_substr() from the phputf8
   * library (mbstring-backed or native, depending on whether
   * ENABLE_MBSTRING is defined).
   * @see http://ca.php.net/manual/en/function.substr.php
   * @param $string string
   * @param $start int
   * @param $length mixed optional; false (the default) is passed through
   *  to utf8_substr() unchanged
   * @return mixed whatever utf8_substr() returns
   */
  function substr($string, $start, $length = false) {
      if (!defined('ENABLE_MBSTRING')) {
          require_once 'utils/unicode.php';
          require_once 'native/core.php';
      } else {
          require_once 'mbstring/core.php';
      }
      $piece = utf8_substr($string, $start, $length);
      return $piece;
  }
  /**
   * Character-based (multibyte-aware) substr_replace().
   *
   * The previous implementation declared a global mb_substr_replace()
   * helper but never called it, returned the byte-wise native
   * substr_replace() result instead, and returned NULL on every call after
   * the first (its return statement sat inside the
   * "helper not yet declared" branch). The character-based algorithm is
   * now applied directly, using String::strlen() and String::substr().
   * @see http://ca.php.net/manual/en/function.substr_replace.php
   * Thanks to poster at http://ca.php.net/manual/en/function.substr-replace.php#90146
   * @param $string string input string
   * @param $replacement string text to insert
   * @param $start int character offset; negative counts from the end
   * @param $length int optional number of characters to replace; null
   *  replaces through the end of the string, negative leaves that many
   *  characters at the end untouched
   * @return string
   */
  function substr_replace($string, $replacement, $start, $length = null) {
      $stringLength = String::strlen($string);

      // Clamp the start offset into [0, $stringLength].
      if ($start < 0) {
          $start = max(0, $stringLength + $start);
      } else if ($start > $stringLength) {
          $start = $stringLength;
      }

      // Resolve the number of characters to replace.
      if (is_null($length) || $length > $stringLength) {
          $length = $stringLength;
      } else if ($length < 0) {
          $length = max(0, $stringLength - $start + $length);
      }
      if (($start + $length) > $stringLength) {
          $length = $stringLength - $start;
      }

      // Zero-length pieces are produced locally rather than by asking
      // String::substr() for an empty slice.
      $head = '';
      if ($start > 0) {
          $head = String::substr($string, 0, $start);
      }
      $tail = '';
      $tailStart = $start + $length;
      if ($tailStart < $stringLength) {
          $tail = String::substr($string, $tailStart, $stringLength - $tailStart);
      }

      return $head . $replacement . $tail;
  }
  /**
   * UTF-8 aware strtolower(); delegates to utf8_strtolower() from the
   * phputf8 library (mbstring-backed or native, depending on whether
   * ENABLE_MBSTRING is defined).
   * @see http://ca.php.net/manual/en/function.strtolower.php
   * @param $string string
   * @return mixed whatever utf8_strtolower() returns
   */
  function strtolower($string) {
      if (!defined('ENABLE_MBSTRING')) {
          require_once 'utils/unicode.php';
          require_once 'native/core.php';
      } else {
          require_once 'mbstring/core.php';
      }
      $lowered = utf8_strtolower($string);
      return $lowered;
  }
  /**
   * UTF-8 aware strtoupper(); delegates to utf8_strtoupper() from the
   * phputf8 library (mbstring-backed or native, depending on whether
   * ENABLE_MBSTRING is defined).
   * @see http://ca.php.net/manual/en/function.strtoupper.php
   * @param $string string
   * @return mixed whatever utf8_strtoupper() returns
   */
  function strtoupper($string) {
      if (!defined('ENABLE_MBSTRING')) {
          require_once 'utils/unicode.php';
          require_once 'native/core.php';
      } else {
          require_once 'mbstring/core.php';
      }
      $raised = utf8_strtoupper($string);
      return $raised;
  }
  /**
   * UTF-8 aware ucfirst(); delegates to utf8_ucfirst() from the phputf8
   * library. The core implementation (mbstring-backed or native,
   * depending on whether ENABLE_MBSTRING is defined) is loaded first,
   * followed by ucfirst.php in either case.
   * @see http://ca.php.net/manual/en/function.ucfirst.php
   * @param $string string
   * @return mixed whatever utf8_ucfirst() returns
   */
  function ucfirst($string) {
      if (!defined('ENABLE_MBSTRING')) {
          require_once 'utils/unicode.php';
          require_once 'native/core.php';
      } else {
          require_once 'mbstring/core.php';
      }
      require_once 'ucfirst.php';

      $capitalised = utf8_ucfirst($string);
      return $capitalised;
  }
  /**
   * Count occurrences of a substring, using mb_substr_count() when
   * ENABLE_MBSTRING is defined (requires PHP >= 4.3.0) and the native
   * substr_count() otherwise.
   * @see http://ca.php.net/manual/en/function.substr_count.php
   * @param $haystack string
   * @param $needle string
   * @return int
   */
  function substr_count($haystack, $needle) {
      $useMbstring = defined('ENABLE_MBSTRING');
      if (!$useMbstring) {
          return substr_count($haystack, $needle);
      }
      return mb_substr_count($haystack, $needle);
  }
  /**
   * Encode a string for use in a MIME header.
   *
   * With mbstring enabled the string is passed to mb_encode_mimeheader()
   * using the internal encoding, "B" transfer encoding and MAIL_EOL as
   * the line-feed sequence. Without mbstring the input is returned
   * unchanged.
   * @param $string string
   * @return string
   */
  function encode_mime_header($string) {
      if (!defined('ENABLE_MBSTRING')) {
          return $string;
      }
      return mb_encode_mimeheader($string, mb_internal_encoding(), 'B', MAIL_EOL);
  }
  /**
   * Send an email through PHP's native mail().
   *
   * mb_send_mail() is deliberately not used: it base64 encodes the whole
   * body of the email, making it useless for multipart emails. The fifth
   * argument is only forwarded when it is non-empty.
   * @see http://ca.php.net/manual/en/function.mail.php
   * @param $to string
   * @param $subject string
   * @param $message string
   * @param $additional_headers string optional
   * @param $additional_parameters string optional
   * @return boolean result of mail()
   */
  function mail($to, $subject, $message, $additional_headers = '', $additional_parameters = '') {
      if (!empty($additional_parameters)) {
          return mail($to, $subject, $message, $additional_headers, $additional_parameters);
      }
      return mail($to, $subject, $message, $additional_headers);
  }
  267. //
  268. // Wrappers for PCRE-compatible regular expression routines.
  269. // See the php.net documentation for usage.
  270. //
  /**
   * Escape regular expression characters; thin wrapper around preg_quote().
   * @see http://ca.php.net/manual/en/function.preg-quote.php
   * @param $string string text to escape
   * @param $delimiter string pattern delimiter to escape as well
   * @return string
   */
  function regexp_quote($string, $delimiter = '/') {
      $escaped = preg_quote($string, $delimiter);
      return $escaped;
  }
  /**
   * Wrapper around preg_grep() that appends the PCRE_UTF8 modifier.
   *
   * preg_grep() operates on an array, whereas String::utf8_compliant() and
   * String::utf8_bad_strip() are documented as taking a single string. The
   * previous code handed the whole array to them; each string element is
   * now checked (and, if necessary, stripped of bad bytes) individually.
   * Array keys are preserved.
   * @see http://ca.php.net/manual/en/function.preg-grep.php
   * @param $pattern string delimited pattern; PCRE_UTF8 is appended to it
   * @param $input array strings to search
   * @return array entries of $input matching the pattern
   */
  function regexp_grep($pattern, $input) {
      if (PCRE_UTF8 && is_array($input)) {
          foreach ($input as $key => $value) {
              if (is_string($value) && !String::utf8_compliant($value)) {
                  $input[$key] = String::utf8_bad_strip($value);
              }
          }
      }
      return preg_grep($pattern . PCRE_UTF8, $input);
  }
  /**
   * Wrapper around preg_match() that appends the PCRE_UTF8 modifier.
   * When that modifier is in use and the subject fails
   * String::utf8_compliant(), the subject is first passed through
   * String::utf8_bad_strip().
   * @see http://ca.php.net/manual/en/function.preg-match.php
   * @param $pattern string delimited pattern; PCRE_UTF8 is appended to it
   * @param $subject string
   * @return mixed result of preg_match()
   */
  function regexp_match($pattern, $subject) {
      $mustStrip = PCRE_UTF8 && !String::utf8_compliant($subject);
      if ($mustStrip) {
          $subject = String::utf8_bad_strip($subject);
      }
      $fullPattern = $pattern . PCRE_UTF8;
      return preg_match($fullPattern, $subject);
  }
  /**
   * Wrapper around preg_match() that also returns the matches and appends
   * the PCRE_UTF8 modifier. When that modifier is in use and the subject
   * fails String::utf8_compliant(), the subject is first passed through
   * String::utf8_bad_strip().
   *
   * NOTE: This function was created since PHP < 5.x does not support
   * optional reference parameters.
   * @see http://ca.php.net/manual/en/function.preg-match.php
   * @param $pattern string delimited pattern; PCRE_UTF8 is appended to it
   * @param $subject string
   * @param $matches array filled by preg_match()
   * @return mixed result of preg_match()
   */
  function regexp_match_get($pattern, $subject, &$matches) {
      $mustStrip = PCRE_UTF8 && !String::utf8_compliant($subject);
      if ($mustStrip) {
          $subject = String::utf8_bad_strip($subject);
      }
      $fullPattern = $pattern . PCRE_UTF8;
      return preg_match($fullPattern, $subject, $matches);
  }
  /**
   * Wrapper around preg_match_all() that appends the PCRE_UTF8 modifier.
   * When that modifier is in use and the subject fails
   * String::utf8_compliant(), the subject is first passed through
   * String::utf8_bad_strip().
   * @see http://ca.php.net/manual/en/function.preg-match-all.php
   * @param $pattern string delimited pattern; PCRE_UTF8 is appended to it
   * @param $subject string
   * @param $matches array filled by preg_match_all()
   * @return mixed result of preg_match_all()
   */
  function regexp_match_all($pattern, $subject, &$matches) {
      $mustStrip = PCRE_UTF8 && !String::utf8_compliant($subject);
      if ($mustStrip) {
          $subject = String::utf8_bad_strip($subject);
      }
      $fullPattern = $pattern . PCRE_UTF8;
      return preg_match_all($fullPattern, $subject, $matches);
  }
  /**
   * Wrapper around preg_replace() that appends the PCRE_UTF8 modifier.
   * When that modifier is in use and the subject fails
   * String::utf8_compliant(), the subject is first passed through
   * String::utf8_bad_strip().
   * @see http://ca.php.net/manual/en/function.preg-replace.php
   * @param $pattern string delimited pattern; PCRE_UTF8 is appended to it
   * @param $replacement string
   * @param $subject string
   * @param $limit int maximum replacements, -1 for no limit
   * @return mixed result of preg_replace()
   */
  function regexp_replace($pattern, $replacement, $subject, $limit = -1) {
      $mustStrip = PCRE_UTF8 && !String::utf8_compliant($subject);
      if ($mustStrip) {
          $subject = String::utf8_bad_strip($subject);
      }
      $fullPattern = $pattern . PCRE_UTF8;
      return preg_replace($fullPattern, $replacement, $subject, $limit);
  }
  /**
   * Wrapper around preg_replace_callback() that appends the PCRE_UTF8
   * modifier. When that modifier is in use and the subject fails
   * String::utf8_compliant(), the subject is first passed through
   * String::utf8_bad_strip().
   * @see http://ca.php.net/manual/en/function.preg-replace-callback.php
   * @param $pattern string delimited pattern; PCRE_UTF8 is appended to it
   * @param $callback callback invoked for every match
   * @param $subject string
   * @param $limit int maximum replacements, -1 for no limit
   * @return mixed result of preg_replace_callback()
   */
  function regexp_replace_callback($pattern, $callback, $subject, $limit = -1) {
      $mustStrip = PCRE_UTF8 && !String::utf8_compliant($subject);
      if ($mustStrip) {
          $subject = String::utf8_bad_strip($subject);
      }
      $fullPattern = $pattern . PCRE_UTF8;
      return preg_replace_callback($fullPattern, $callback, $subject, $limit);
  }
  /**
   * Wrapper around preg_split() that appends the PCRE_UTF8 modifier.
   * When that modifier is in use and the subject fails
   * String::utf8_compliant(), the subject is first passed through
   * String::utf8_bad_strip().
   * @see http://ca.php.net/manual/en/function.preg-split.php
   * @param $pattern string delimited pattern; PCRE_UTF8 is appended to it
   * @param $subject string
   * @param $limit int maximum number of pieces, -1 for no limit
   * @return mixed result of preg_split()
   */
  function regexp_split($pattern, $subject, $limit = -1) {
      $mustStrip = PCRE_UTF8 && !String::utf8_compliant($subject);
      if ($mustStrip) {
          $subject = String::utf8_bad_strip($subject);
      }
      $fullPattern = $pattern . PCRE_UTF8;
      return preg_split($fullPattern, $subject, $limit);
  }
  /**
   * Determine the MIME type of a file.
   *
   * Three strategies are tried in order:
   *  1. the native mime_content_type() function, if it exists;
   *  2. the fileinfo extension, using a handle kept in the Registry under
   *     the "fileInfo" key and opened on demand;
   *  3. the external "file" command line tool.
   * For strategies 1 and 3 anything from the first ";" onwards (e.g. a
   * charset suffix) is cut off; strategy 2 returns the first token
   * delimited by a space or ";".
   * @see http://ca.php.net/manual/en/function.mime_content_type.php
   * @param $filename string path of the file to inspect
   * @return string
   */
  function mime_content_type($filename) {
      if (function_exists('mime_content_type')) {
          $mimeType = mime_content_type($filename);
          // mime_content_type appears to return a charset
          // (erroneously?) in recent versions of PHP5
          $separator = strpos($mimeType, ';');
          if ($separator !== false) {
              $mimeType = trim(substr($mimeType, 0, $separator));
          }
          return $mimeType;
      }

      if (function_exists('finfo_open')) {
          $finfo =& Registry::get('fileInfo', true, null);
          if ($finfo === null) {
              $finfo = finfo_open(FILEINFO_MIME, Config::getVar('finfo', 'mime_database_path'));
          }
          if ($finfo !== false) {
              return strtok(finfo_file($finfo, $filename), ' ;');
          }
      }

      // Fall back on an external "file" tool
      $f = escapeshellarg($filename);
      $toolOutput = trim(`file --brief --mime $f`);
      // Make sure we just return the mime type.
      $separator = strpos($toolOutput, ';');
      if ($separator !== false) {
          $toolOutput = trim(substr($toolOutput, 0, $separator));
      }
      return $toolOutput;
  }
  /**
   * Strip unsafe HTML from the input text. Covers XSS attacks like scripts,
   * onclick(...) attributes, javascript: urls, and special characters.
   *
   * When USE_HTML_PURIFIER is defined the work is delegated to HTMLPurifier,
   * configured once per request from the "allowed_html" security setting
   * (DEFAULT_ALLOWED_HTML is passed to Config::getVar() alongside it).
   * Otherwise a regular-expression based fallback is applied.
   *
   * The fallback's on<event> attribute filter used to spell the hexadecimal
   * entity alternatives for "o" and "n" without the "x" (e.g. "&#0*4f;"),
   * so hex-encoded camouflage such as "&#x6f;nclick" was not caught. The
   * alternatives are now written as "&#x0*4f;" etc.
   * @param $input string input string
   * @return string
   */
  function stripUnsafeHtml($input) {
      // If possible, use the HTML purifier.
      if (defined('USE_HTML_PURIFIER')) {
          require_once('lib/pkp/lib/htmlpurifier/library/HTMLPurifier.path.php');
          require_once('HTMLPurifier.includes.php');
          static $purifier;
          if (!isset($purifier)) {
              $config = HTMLPurifier_Config::createDefault();
              $config->set('Core.Encoding', Config::getVar('i18n', 'client_charset'));
              $config->set('HTML.Doctype', 'XHTML 1.0 Transitional');
              // Transform the old allowed_html setting into
              // a form HTMLPurifier can use.
              $config->set('HTML.Allowed', preg_replace(
                  '/<(\w+)[ ]?([^>]*)>[ ]?/',
                  '${1}[${2}],',
                  Config::getVar('security', 'allowed_html', DEFAULT_ALLOWED_HTML)
              ));
              $config->set('Cache.SerializerPath', 'cache');
              $purifier = new HTMLPurifier($config);
          }
          return $purifier->purify($input);
      }

      // Fall back on imperfect but PHP4-capable implementation.
      // Parts of this implementation were taken from Horde:
      // see http://cvs.horde.org/co.php/framework/MIME/MIME/Viewer/html.php.
      static $allowedHtml;
      if (!isset($allowedHtml)) {
          $allowedHtml = preg_replace(
              '/<(\w+)( [^>]+)*>/', // Strip out attr specs
              '<${1}> ',
              Config::getVar('security', 'allowed_html', DEFAULT_ALLOWED_HTML)
          );
      }
      $html = strip_tags($input, $allowedHtml);

      // Change space entities to space characters
      $html = preg_replace('/&#(x0*20|0*32);?/i', ' ', $html);

      // Remove non-printable characters
      $html = preg_replace('/&#x?0*([9A-D]|1[0-3]);/i', '&nbsp;', $html);
      $html = preg_replace('/&#x?0*[9A-D]([^0-9A-F]|$)/i', '&nbsp\\1', $html);
      $html = preg_replace('/&#0*(9|1[0-3])([^0-9]|$)/i', '&nbsp\\2', $html);

      // Remove overly long numeric entities
      $html = preg_replace('/&#x?0*[0-9A-F]{6,};?/i', '&nbsp;', $html);

      /* Get all attribute="javascript:foo()" tags. This is
       * essentially the regex /(=|url\()("?)[^>]* script:/ but
       * expanded to catch camouflage with spaces and entities. */
      $preg = '/((&#0*61;?|&#x0*3D;?|=)|'
          . '((u|&#0*85;?|&#x0*55;?|&#0*117;?|&#x0*75;?)\s*'
          . '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*'
          . '(l|&#0*76;?|&#x0*4c;?|&#0*108;?|&#x0*6c;?)\s*'
          . '(\()))\s*'
          . '(&#0*34;?|&#x0*22;?|"|&#0*39;?|&#x0*27;?|\')?'
          . '[^>]*\s*'
          . '(s|&#0*83;?|&#x0*53;?|&#0*115;?|&#x0*73;?)\s*'
          . '(c|&#0*67;?|&#x0*43;?|&#0*99;?|&#x0*63;?)\s*'
          . '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*'
          . '(i|&#0*73;?|&#x0*49;?|&#0*105;?|&#x0*69;?)\s*'
          . '(p|&#0*80;?|&#x0*50;?|&#0*112;?|&#x0*70;?)\s*'
          . '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?)\s*'
          . '(:|&#0*58;?|&#x0*3a;?)/i';
      $html = preg_replace($preg, '\1\8PKPCleaned', $html);

      /* Get all on<foo>="bar()". NEVER allow these. Each letter may be
       * given literally, as a decimal entity or as a hexadecimal entity
       * ("O" is 79 / 0x4f, "o" is 111 / 0x6f, "N" is 78 / 0x4e,
       * "n" is 110 / 0x6e). */
      $html = preg_replace('/([\s"\']+'
          . '(o|&#0*79;?|&#x0*4f;?|&#0*111;?|&#x0*6f;?)'
          . '(n|&#0*78;?|&#x0*4e;?|&#0*110;?|&#x0*6e;?)'
          . '\w+)\s*=/i', '\1PKPCleaned=', $html);

      $pattern = array(
          '|<([^>]*)&{.*}([^>]*)>|',
          '|<([^>]*)mocha:([^>]*)>|i',
          '|<([^>]*)binding:([^>]*)>|i'
      );
      $replace = array('<&{;}\3>', '<\1PKPCleaned:\2>', '<\1PKPCleaned:\2>');
      $html = preg_replace($pattern, $replace, $html);

      return $html;
  }
  /**
   * Convert limited HTML into a string.
   *
   * Paragraph tags, closing list items and line breaks become newlines,
   * opening list items become a bullet entity, all remaining tags are
   * stripped and the result is run through String::html2utf().
   * @param $html string
   * @return string
   */
  function html2text($html) {
      // Applied in this order, one String::regexp_replace() call each.
      $conversions = array(
          '/<[\/]?p>/' => "\n",
          '/<li>/' => '&bull; ',
          '/<\/li>/' => "\n",
          '/<br[ ]?[\/]?>/' => "\n"
      );
      foreach ($conversions as $tagPattern => $substitute) {
          $html = String::regexp_replace($tagPattern, $substitute, $html);
      }
      return String::html2utf(strip_tags($html));
  }
  450. //
  451. // Wrappers for UTF-8 validation routines
  452. // See the phputf8 documentation for usage.
  453. //
  /**
   * Strict UTF-8 validation; delegates to utf8_is_valid() from phputf8's
   * utils/validation.php.
   * @param $str string input string
   * @return boolean
   */
  function utf8_is_valid($str) {
      require_once 'utils/validation.php';
      $isValid = utf8_is_valid($str);
      return $isValid;
  }
  /**
   * Tests whether a string complies as UTF-8; faster and less strict than
   * utf8_is_valid. Delegates to utf8_compliant() from phputf8; see
   * lib/phputf8/utils/validation.php for more details.
   * @param $str string input string
   * @return boolean
   */
  function utf8_compliant($str) {
      require_once 'utils/validation.php';
      $isCompliant = utf8_compliant($str);
      return $isCompliant;
  }
  /**
   * Locates the first bad byte in a UTF-8 string, returning its byte
   * index in the string. Delegates to utf8_bad_find() from phputf8's
   * utils/bad.php.
   * @param $str string input string
   * @return mixed whatever utf8_bad_find() returns (presumably the byte
   *  index, or a non-index value when nothing is found; see phputf8 docs)
   */
  function utf8_bad_find($str) {
      require_once 'utils/bad.php';
      $badIndex = utf8_bad_find($str);
      return $badIndex;
  }
  /**
   * Strips out any bad bytes from a UTF-8 string and returns the rest.
   * Delegates to utf8_bad_strip() from phputf8's utils/bad.php.
   * @param $str string input string
   * @return string
   */
  function utf8_bad_strip($str) {
      require_once 'utils/bad.php';
      $cleaned = utf8_bad_strip($str);
      return $cleaned;
  }
  /**
   * Replace bad bytes with an alternative (ASCII) character. Delegates to
   * utf8_bad_replace() from phputf8's utils/bad.php.
   * @param $str string input string
   * @param $replace string optional replacement, "?" by default
   * @return string
   */
  function utf8_bad_replace($str, $replace = '?') {
      require_once 'utils/bad.php';
      $repaired = utf8_bad_replace($str, $replace);
      return $repaired;
  }
  /**
   * Delegates to utf8_strip_ascii_ctrl() from phputf8's utils/ascii.php.
   * NOTE(review): judging by the function name this removes ASCII control
   * characters from the string -- confirm against the phputf8 docs.
   * @param $str string input string
   * @return string
   */
  function utf8_strip_ascii_ctrl($str) {
      require_once 'utils/ascii.php';
      $stripped = utf8_strip_ascii_ctrl($str);
      return $stripped;
  }
  /**
   * Normalize a string in an unknown (non-UTF8) encoding into a valid
   * UTF-8 sequence.
   *
   * The source encoding is guessed with mb_detect_encoding() when mbstring
   * is usable; otherwise iconv is used to tell CP1252 from ISO-8859-1,
   * and ISO-8859-1 is assumed as a last resort. The string is then
   * transcoded with the Transcoder class.
   * @param $str string input string
   * @return string
   */
  function utf8_normalize($str) {
      import('core.Transcoder');

      if (String::hasMBString()) {
          // NB: CP-1252 often segfaults; we've left it out here but it will detect as 'ISO-8859-1'
          $candidateEncodings = 'UTF-8, UTF-7, ASCII, ISO-8859-1, EUC-JP, SJIS, eucJP-win, SJIS-win, JIS, ISO-2022-JP';
          $sourceEncoding = checkPhpVersion('4.3.8')
              ? mb_detect_encoding($str, $candidateEncodings, FALSE)
              : mb_detect_encoding($str, $candidateEncodings);
      } elseif (function_exists('iconv') && strlen(iconv('CP1252', 'UTF-8', $str)) != strlen(iconv('ISO-8859-1', 'UTF-8', $str))) {
          // use iconv to detect CP-1252, assuming default ISO-8859-1
          $sourceEncoding = 'CP1252';
      } else {
          // assume ISO-8859-1, PHP default
          $sourceEncoding = 'ISO-8859-1';
      }

      // transcode CP-1252/ISO-8859-1 into HTML entities; this works because CP-1252 is mapped onto ISO-8859-1
      $isLatin = ($sourceEncoding == 'ISO-8859-1') || ($sourceEncoding == 'CP1252');
      if ($isLatin) {
          $entityTranscoder = new Transcoder('CP1252', 'HTML-ENTITIES');
          $str = $entityTranscoder->trans($str);
      }

      // transcode from detected encoding to UTF-8
      $utf8Transcoder = new Transcoder($sourceEncoding, 'UTF-8');
      return $utf8Transcoder->trans($str);
  }
  /**
   * US-ASCII transliterations of Unicode text. Delegates to the
   * utf8_to_ascii() function loaded from utf8_to_ascii.php.
   * @param $str string input string
   * @return string
   */
  function utf8_to_ascii($str) {
      require_once('utf8_to_ascii.php');
      $ascii = utf8_to_ascii($str);
      return $ascii;
  }
  /**
   * Returns the UTF-8 string corresponding to the unicode value.
   * Does not require any multibyte PHP libraries
   * (from php.net, courtesy - romans@void.lv).
   *
   * Values below 128 map to a single byte; larger values are encoded as a
   * lead byte followed by one, two or three continuation bytes. Values of
   * 2097152 and above yield an empty string.
   * @param $num int
   * @return string
   */
  function code2utf ($num) {
      if ($num < 128) return chr($num);

      // Pick the number of continuation bytes and the lead-byte marker.
      if ($num < 2048) {
          $continuationBytes = 1;
          $leadMarker = 192;
      } elseif ($num < 65536) {
          $continuationBytes = 2;
          $leadMarker = 224;
      } elseif ($num < 2097152) {
          $continuationBytes = 3;
          $leadMarker = 240;
      } else {
          return '';
      }

      $encoded = chr(($num >> (6 * $continuationBytes)) + $leadMarker);
      // Each continuation byte carries six payload bits, highest first.
      for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
          $encoded .= chr((($num >> $shift) & 63) + 128);
      }
      return $encoded;
  }
  /**
   * Convert UTF-8 encoded characters in a string to escaped HTML entities.
   * This is a helper function for transcoding into HTML or XML for output.
   *
   * Every two-, three- or four-byte UTF-8 sequence is replaced by a decimal
   * numeric entity ("&#NNN;"); single-byte characters are copied through.
   * Changes from the previous implementation:
   *  - string offsets use [] instead of the {} syntax removed in PHP 8;
   *  - four-byte sequences are converted too, matching what
   *    String::code2utf() can produce;
   *  - a lead byte without enough following bytes is left untouched
   *    instead of reading past the end of the string.
   * Continuation bytes are not validated.
   * @param $str string input string
   * @return string
   */
  function utf2html ($str) {
      $ret = '';
      $max = strlen($str);
      $last = 0; // index of the first byte not yet copied to $ret

      for ($i = 0; $i < $max; $i++) {
          $lead = ord($str[$i]);

          // Classify the lead byte: number of continuation bytes that
          // follow and the payload bits carried by the lead byte itself.
          if ($lead >> 5 == 6) { // 110x xxxx
              $extra = 1;
              $code = $lead & 31;
          } elseif ($lead >> 4 == 14) { // 1110 xxxx
              $extra = 2;
              $code = $lead & 15;
          } elseif ($lead >> 3 == 30) { // 1111 0xxx
              $extra = 3;
              $code = $lead & 7;
          } else {
              continue; // regular character (or stray continuation byte)
          }

          // Truncated sequence: keep the byte as part of the regular run.
          if ($i + $extra >= $max) continue;

          // Flush the regular characters passed since the last entity.
          $ret .= substr($str, $last, $i - $last);

          // Each continuation byte contributes its low six bits.
          for ($j = 1; $j <= $extra; $j++) {
              $code = ($code << 6) | (ord($str[$i + $j]) & 63);
          }
          $ret .= '&#' . $code . ';';

          $i += $extra;
          $last = $i + 1;
      }

      // Append the last batch of regular characters.
      if ($last < $max) {
          $ret .= substr($str, $last);
      }
      return $ret;
  }
  /**
   * Convert numeric HTML entities in a string to UTF-8 encoded characters.
   * This is a native alternative to the buggy html_entity_decode() using UTF8.
   *
   * Named entities are first mapped to numeric ones via
   * String::getHTMLEntities(); hexadecimal and then decimal numeric
   * entities are decoded with String::code2utf().
   *
   * The replacements previously relied on the "e" (eval) pattern modifier,
   * which was removed in PHP 7. Callbacks are used instead. This also
   * means a decimal entity with leading zeros (e.g. "&#0160;") is read as
   * a decimal number rather than being evaluated as an octal PHP literal.
   * @param $str string input string
   * @return string
   */
  function html2utf($str) {
      // convert named entities to numeric entities
      $str = strtr($str, String::getHTMLEntities());

      // use PCRE-aware replace function to replace numeric entities
      $str = String::regexp_replace_callback('~&#x([0-9a-f]+);~i', array('String', '_hexEntityToUtf'), $str);
      $str = String::regexp_replace_callback('~&#([0-9]+);~', array('String', '_decEntityToUtf'), $str);

      return $str;
  }

  /**
   * Callback for html2utf(): decode one hexadecimal numeric entity.
   * @param $matches array match data; index 1 holds the hex digits
   * @return string
   */
  function _hexEntityToUtf($matches) {
      return String::code2utf(hexdec($matches[1]));
  }

  /**
   * Callback for html2utf(): decode one decimal numeric entity.
   * @param $matches array match data; index 1 holds the decimal digits
   * @return string
   */
  function _decEntityToUtf($matches) {
      return String::code2utf((int) $matches[1]);
  }
  /**
   * Return an associative array mapping named HTML entities to their
   * numeric (decimal) entity form, e.g. "&amp;" => "&#38;".
   * Required to support HTML functions without objects in PHP4/PHP5.
   * From php.net: function.get-html-translation-table.php
   * @return array
   */
  function getHTMLEntities () {
      // The conversion table, keyed by named entity.
      return array(
          "&Aacute;" => "&#193;", "&aacute;" => "&#225;", "&Acirc;" => "&#194;",
          "&acirc;" => "&#226;", "&acute;" => "&#180;", "&AElig;" => "&#198;",
          "&aelig;" => "&#230;", "&Agrave;" => "&#192;", "&agrave;" => "&#224;",
          "&alefsym;" => "&#8501;","&Alpha;" => "&#913;", "&alpha;" => "&#945;",
          "&amp;" => "&#38;", "&and;" => "&#8743;", "&ang;" => "&#8736;",
          "&apos;" => "&#39;", "&Aring;" => "&#197;", "&aring;" => "&#229;",
          "&asymp;" => "&#8776;", "&Atilde;" => "&#195;", "&atilde;" => "&#227;",
          "&Auml;" => "&#196;", "&auml;" => "&#228;", "&bdquo;" => "&#8222;",
          "&Beta;" => "&#914;", "&beta;" => "&#946;", "&brvbar;" => "&#166;",
          "&bull;" => "&#8226;", "&cap;" => "&#8745;", "&Ccedil;" => "&#199;",
          "&ccedil;" => "&#231;", "&cedil;" => "&#184;", "&cent;" => "&#162;",
          "&Chi;" => "&#935;", "&chi;" => "&#967;", "&circ;" => "&#94;",
          "&clubs;" => "&#9827;", "&cong;" => "&#8773;", "&copy;" => "&#169;",
          "&crarr;" => "&#8629;", "&cup;" => "&#8746;", "&curren;" => "&#164;",
          "&dagger;" => "&#8224;","&Dagger;" => "&#8225;", "&darr;" => "&#8595;",
          "&dArr;" => "&#8659;", "&deg;" => "&#176;", "&Delta;" => "&#916;",
          "&delta;" => "&#948;", "&diams;" => "&#9830;", "&divide;" => "&#247;",
          "&Eacute;" => "&#201;", "&eacute;" => "&#233;", "&Ecirc;" => "&#202;",
          "&ecirc;" => "&#234;", "&Egrave;" => "&#200;", "&egrave;" => "&#232;",
          "&empty;" => "&#8709;", "&emsp;" => "&#8195;", "&ensp;" => "&#8194;",
          "&Epsilon;" => "&#917;","&epsilon;" => "&#949;","&equiv;" => "&#8801;",
          "&Eta;" => "&#919;", "&eta;" => "&#951;", "&ETH;" => "&#208;",
          "&eth;" => "&#240;", "&Euml;" => "&#203;", "&euml;" => "&#235;",
          "&euro;" => "&#8364;", "&exist;" => "&#8707;", "&fnof;" => "&#402;",
          "&forall;" => "&#8704;","&frac12;" => "&#189;", "&frac14;" => "&#188;",
          "&frac34;" => "&#190;", "&frasl;" => "&#8260;", "&Gamma;" => "&#915;",
          "&gamma;" => "&#947;", "&ge;" => "&#8805;", "&gt;" => "&#62;",
          "&harr;" => "&#8596;", "&hArr;" => "&#8660;", "&hearts;" => "&#9829;",
          "&hellip;" => "&#8230;","&Iacute;" => "&#205;", "&iacute;" => "&#237;",
          "&Icirc;" => "&#206;", "&icirc;" => "&#238;", "&iexcl;" => "&#161;",
          "&Igrave;" => "&#204;", "&igrave;" => "&#236;", "&image;" => "&#8465;",
          "&infin;" => "&#8734;", "&int;" => "&#8747;", "&Iota;" => "&#921;",
          "&iota;" => "&#953;", "&iquest;" => "&#191;", "&isin;" => "&#8712;",
          "&Iuml;" => "&#207;", "&iuml;" => "&#239;", "&Kappa;" => "&#922;",
          "&kappa;" => "&#954;", "&Lambda;" => "&#923;", "&lambda;" => "&#955;",
          "&lang;" => "&#9001;", "&laquo;" => "&#171;", "&larr;" => "&#8592;",
          "&lArr;" => "&#8656;", "&lceil;" => "&#8968;",
          "&ldquo;" => "&#8220;", "&le;" => "&#8804;", "&lfloor;" => "&#8970;",
          "&lowast;" => "&#8727;","&loz;" => "&#9674;", "&lrm;" => "&#8206;",
          "&lsaquo;" => "&#8249;","&lsquo;" => "&#8216;", "&lt;" => "&#60;",
          "&macr;" => "&#175;", "&mdash;" => "&#8212;", "&micro;" => "&#181;",
          "&middot;" => "&#183;", "&minus;" => "&#45;", "&Mu;" => "&#924;",
          "&mu;" => "&#956;", "&nabla;" => "&#8711;", "&nbsp;" => "&#160;",
          "&ndash;" => "&#8211;", "&ne;" => "&#8800;", "&ni;" => "&#8715;",
          "&not;" => "&#172;", "&notin;" => "&#8713;", "&nsub;" => "&#8836;",
          "&Ntilde;" => "&#209;", "&ntilde;" => "&#241;", "&Nu;" => "&#925;",
          "&nu;" => "&#957;", "&Oacute;" => "&#211;", "&oacute;" => "&#243;",
          "&Ocirc;" => "&#212;", "&ocirc;" => "&#244;", "&OElig;" => "&#338;",
          "&oelig;" => "&#339;", "&Ograve;" => "&#210;", "&ograve;" => "&#242;",
          "&oline;" => "&#8254;", "&Omega;" => "&#937;", "&omega;" => "&#969;",
          "&Omicron;" => "&#927;","&omicron;" => "&#959;","&oplus;" => "&#8853;",
          "&or;" => "&#8744;", "&ordf;" => "&#170;", "&ordm;" => "&#186;",
          "&Oslash;" => "&#216;", "&oslash;" => "&#248;", "&Otilde;" => "&#213;",
          "&otilde;" => "&#245;", "&otimes;" => "&#8855;","&Ouml;" => "&#214;",
          "&ouml;" => "&#246;", "&para;" => "&#182;", "&part;" => "&#8706;",
          "&permil;" => "&#8240;","&perp;" => "&#8869;", "&Phi;" => "&#934;",
          "&phi;" => "&#966;", "&Pi;" => "&#928;", "&pi;" => "&#960;",
          "&piv;" => "&#982;", "&plusmn;" => "&#177;", "&pound;" => "&#163;",
          "&prime;" => "&#8242;", "&Prime;" => "&#8243;", "&prod;" => "&#8719;",
          "&prop;" => "&#8733;", "&Psi;" => "&#936;", "&psi;" => "&#968;",
          "&quot;" => "&#34;", "&radic;" => "&#8730;", "&rang;" => "&#9002;",
          "&raquo;" => "&#187;", "&rarr;" => "&#8594;", "&rArr;" => "&#8658;",
          "&rceil;" => "&#8969;", "&rdquo;" => "&#8221;", "&real;" => "&#8476;",
          "&reg;" => "&#174;", "&rfloor;" => "&#8971;","&Rho;" => "&#929;",
          "&rho;" => "&#961;", "&rlm;" => "&#8207;", "&rsaquo;" => "&#8250;",
          "&rsquo;" => "&#8217;", "&sbquo;" => "&#8218;", "&Scaron;" => "&#352;",
          "&scaron;" => "&#353;", "&sdot;" => "&#8901;", "&sect;" => "&#167;",
          "&shy;" => "&#173;", "&Sigma;" => "&#931;", "&sigma;" => "&#963;",
          "&sigmaf;" => "&#962;", "&sim;" => "&#8764;", "&spades;" => "&#9824;",
          "&sub;" => "&#8834;", "&sube;" => "&#8838;", "&sum;" => "&#8721;",
          "&sup1;" => "&#185;", "&sup2;" => "&#178;", "&sup3;" => "&#179;",
          "&sup;" => "&#8835;", "&supe;" => "&#8839;", "&szlig;" => "&#223;",
          "&Tau;" => "&#932;", "&tau;" => "&#964;", "&there4;" => "&#8756;",
          "&Theta;" => "&#920;", "&theta;" => "&#952;", "&thetasym;" => "&#977;",
          "&thinsp;" => "&#8201;","&THORN;" => "&#222;", "&thorn;" => "&#254;",
          "&tilde;" => "&#126;", "&times;" => "&#215;", "&trade;" => "&#8482;",
          "&Uacute;" => "&#218;", "&uacute;" => "&#250;", "&uarr;" => "&#8593;",
          "&uArr;" => "&#8657;", "&Ucirc;" => "&#219;", "&ucirc;" => "&#251;",
          "&Ugrave;" => "&#217;", "&ugrave;" => "&#249;", "&uml;" => "&#168;",
          "&upsih;" => "&#978;", "&Upsilon;" => "&#933;","&upsilon;" => "&#965;",
          "&Uuml;" => "&#220;", "&uuml;" => "&#252;", "&weierp;" => "&#8472;",
          "&Xi;" => "&#926;", "&xi;" => "&#958;", "&Yacute;" => "&#221;",
          "&yacute;" => "&#253;", "&yen;" => "&#165;", "&yuml;" => "&#255;",
          "&Yuml;" => "&#376;", "&Zeta;" => "&#918;", "&zeta;" => "&#950;",
          "&zwj;" => "&#8205;", "&zwnj;" => "&#8204;"
      );
  }
  /**
   * Write one CSV record to a file handle. PHP's native fputcsv() is used
   * whenever it exists; otherwise (i.e. PHP before 5.1.0) a hand-rolled
   * encoder builds the line and writes it with fwrite().
   * See the PHP documentation for fputcsv.
   * @param $handle resource file pointer to write to
   * @param $fields array values making up the record
   * @param $delimiter string field separator; the fallback strips exactly
   *  one trailing byte, so a one-byte delimiter is assumed
   * @param $enclosure string character used to quote fields
   * @return mixed whatever the native fputcsv() or, in the fallback, fwrite() returns
   */
  function fputcsv(&$handle, $fields = array(), $delimiter = ',', $enclosure = '"') {
      // An unqualified function call always targets the global PHP
      // function, never this method, so this is not recursion.
      if (function_exists('fputcsv')) {
          return fputcsv($handle, $fields, $delimiter, $enclosure);
      }

      // Fallback encoder from the PHP website, thanks to boefje at hotmail dot com
      $backslash = '\\';
      $line = '';
      foreach ($fields as $value) {
          // A field gets quoted as soon as it contains any of these.
          $needsQuoting = false;
          foreach (array($delimiter, $enclosure, "\n", "\r", "\t", ' ') as $special) {
              if (strpos($value, $special) !== false) {
                  $needsQuoting = true;
                  break;
              }
          }
          if (!$needsQuoting) {
              $line .= $value . $delimiter;
              continue;
          }

          $quoted = $enclosure;
          $afterBackslash = false;
          $length = strlen($value);
          $pos = 0;
          while ($pos < $length) {
              $char = $value[$pos];
              if ($char == $backslash) {
                  $afterBackslash = true;
              } elseif (!$afterBackslash && $char == $enclosure) {
                  // Double an enclosure character unless it directly
                  // follows a backslash.
                  $quoted .= $enclosure;
              } else {
                  $afterBackslash = false;
              }
              $quoted .= $char;
              $pos++;
          }
          $line .= $quoted . $enclosure . $delimiter;
      }

      // Drop the trailing delimiter and terminate the record.
      return fwrite($handle, substr($line, 0, -1) . "\n");
  }
  /**
   * Strip punctuation and spaces from both ends of a string.
   * Characters removed: space , . ; : ! ? & ( ) [ ] \ /
   * @param $string string input string
   * @return string the input without the surrounding punctuation
   */
  function trimPunctuation($string) {
      // Single-quoted PHP literal: '\\' stands for one backslash.
      $strippedCharacters = ' ,.;:!?&()[]\\/';
      return trim($string, $strippedCharacters);
  }
  /**
   * Convert a string to title case. The title is split on single spaces;
   * every word is lower-cased and then gets its first character
   * upper-cased, except that "small words" (articles, conjunctions,
   * prepositions) stay lower-case unless they are the first word.
   * @param $title string
   * @return string the title-cased string
   */
  function titleCase($title) {
      $smallWords = array(
          'of', 'a', 'the', 'and', 'an', 'or', 'nor', 'but', 'is', 'if', 'then',
          'else', 'when', 'at', 'from', 'by', 'on', 'off', 'for', 'in', 'out',
          'over', 'to', 'into', 'with'
      );

      $words = explode(' ', $title);
      $wordCount = count($words);
      for ($position = 0; $position < $wordCount; $position++) {
          $lowered = self::strtolower($words[$position]);
          // Only a small word that is not in first position stays lower-case.
          $keepLowerCase = $position > 0 && in_array($lowered, $smallWords);
          $words[$position] = $keepLowerCase ? $lowered : ucfirst($lowered);
      }

      return implode(' ', $words);
  }
  /**
   * Iterate over an array of delimiters and split the input string on
   * the first delimiter that occurs in it.
   *
   * Delimiters are tried in the given order. Each one is normalised to
   * a string so that the containment check and explode() always agree
   * on what is being searched for. Empty delimiters are skipped: they
   * cannot split anything and explode() rejects them.
   * @param $delimiters array candidate delimiters, in order of preference
   * @param $input string
   * @return array the pieces of the input, or a one-element array
   *  holding the unchanged input when no delimiter matches
   */
  function iterativeExplode($delimiters, $input) {
      // Run through the delimiters and try them out one by one.
      foreach ($delimiters as $delimiter) {
          $delimiter = (string) $delimiter;
          if ($delimiter === '') {
              continue;
          }
          if (strpos($input, $delimiter) !== false) {
              return explode($delimiter, $input);
          }
      }

      // If none of the delimiters works then return
      // the original string as an array.
      return array($input);
  }
  /**
   * Transform "handler-class" to "HandlerClass"
   * and "my-op" to "myOp".
   * Spaces already present in the input are removed as well, because
   * hyphens are turned into spaces before the words are joined.
   * @param $string string input string
   * @param $type mixed which kind of camel case: CAMEL_CASE_HEAD_UP
   *  ("MyOp") or CAMEL_CASE_HEAD_DOWN ("myOp")
   * @return string the string in camel case
   */
  function camelize($string, $type = CAMEL_CASE_HEAD_UP) {
      assert($type == CAMEL_CASE_HEAD_UP || $type == CAMEL_CASE_HEAD_DOWN);

      // "my-op" -> "my op" -> "My Op" -> "MyOp"
      $spaced = str_replace('-', ' ', $string);
      $capitalized = ucwords($spaced);
      $camelized = str_replace(' ', '', $capitalized);

      if ($type != CAMEL_CASE_HEAD_DOWN) {
          return $camelized;
      }

      // Transform "MyOp" to "myOp". lcfirst() requires PHP 5.3+, so
      // lower-case the first character by hand for older versions.
      $head = substr($camelized, 0, 1);
      $tail = substr($camelized, 1);
      return strtolower($head) . $tail;
  }
  /**
   * Transform "HandlerClass" to "handler-class"
   * and "myOp" to "my-op".
   * The input is asserted to be non-empty and to consist solely of
   * words made of one upper-case letter followed by lower-case letters
   * or digits (after the first character has been upper-cased).
   * @param $string string camel-cased input string
   * @return string the lower-case, hyphen-separated string
   */
  function uncamelize($string) {
      assert(!empty($string));

      // Transform "myOp" to "MyOp" so that the first word also starts
      // with a capital letter and is picked up by the pattern below.
      $headUp = ucfirst($string);

      // Collect the words; the matches are expected in $matches[0].
      $matches = array();
      String::regexp_match_all('/[A-Z][a-z0-9]*/', $headUp, $matches);

      // The matched words must account for every character of the input.
      assert(isset($matches[0]) && !empty($matches[0]) && strlen(implode('', $matches[0])) == strlen($headUp));

      // Insert hyphens between words and return the string in lowercase.
      $hyphenated = implode('-', $matches[0]);
      return strtolower($hyphenated);
  }
  838. }
  839. ?>