PageRenderTime 51ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/libs/verysimple/String/VerySimpleStringUtil.php

http://github.com/jasonhinkle/phreeze
PHP | 527 lines | 489 code | 13 blank | 25 comment | 0 complexity | 3f84b7e0b8a0590ac3dcf4bbec8079bb MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
  1. <?php
  2. /** @package verysimple::String */
  3. /**
  4. * A set of utility functions for working with strings
  5. *
  6. * @package verysimple::String
  7. * @author Jason Hinkle
  8. * @copyright 1997-2008 VerySimple, Inc.
  9. * @license http://www.gnu.org/licenses/lgpl.html LGPL
  10. * @version 1.0
  11. */
  class VerySimpleStringUtil
  {
      /** @var string the character set used when converting non-ascii characters */
      public static $DEFAULT_CHARACTER_SET = 'UTF-8';

      /** @var array fancy/smart quote characters plus em dash, mapped to generic ascii replacements */
      public static $SMART_QUOTE_CHARS;

      /** @var array xml reserved characters mapped to their entities */
      public static $XML_SPECIAL_CHARS;

      /** @var array named html entities mapped to their numeric equivalent (e.g. "&eacute;" => "&#233;") */
      public static $HTML_ENTITIES_TABLE;

      /** @var array Windows-1252 bytes (128-159) that are not valid on their own, mapped to numeric entities */
      public static $INVALID_CODE_CHARS;

      /** @var array control characters such as escape, backspace, etc, mapped to numeric entities */
      public static $CONTROL_CODE_CHARS;

      /**
       * Windows-1252 byte value => Unicode code point for the 0x80-0x9F range.
       * Single source of truth for both $INVALID_CODE_CHARS and chr_utf8().
       * @var array
       */
      private static $CP1252_TO_UNICODE = array(
          128 => 8364, 130 => 8218, 131 => 402,  132 => 8222, 133 => 8230,
          134 => 8224, 135 => 8225, 136 => 710,  137 => 8240, 138 => 352,
          139 => 8249, 140 => 338,  142 => 381,  145 => 8216, 146 => 8217,
          147 => 8220, 148 => 8221, 149 => 8226, 150 => 8211, 151 => 8212,
          152 => 732,  153 => 8482, 154 => 353,  155 => 8250, 156 => 339,
          158 => 382,  159 => 376
      );

      /**
       * Replace only the first occurrence of a needle within a string.
       *
       * If the needle does not occur, the haystack is returned unchanged
       * (previously the replacement was prepended and the start of the
       * haystack was cut off). An empty needle matches at offset 0, so the
       * replacement is prepended.
       *
       * @param string $s needle
       * @param string $r replacement
       * @param string $str haystack
       * @return string
       */
      public static function ReplaceFirst($s, $r, $str)
      {
          $needle = (string) $s;
          $haystack = (string) $str;

          // strpos() rejects an empty needle on PHP < 8, so handle it up front
          if ($needle === '') return $r . $haystack;

          $offset = strpos($haystack, $needle);
          if ($offset === false) return $haystack;

          return substr_replace($haystack, $r, $offset, strlen($needle));
      }

      /**
       * Populates the static lookup tables.
       * VerySimpleStringUtil::InitStaticVars(); is called at the bottom of this file
       */
      public static function InitStaticVars()
      {
          // named entity => numeric entity. The table keys are UTF-8 characters, so the
          // code point has to be decoded from the whole sequence; ord() would only see
          // the first byte and yield e.g. &#194; for &nbsp;
          self::$HTML_ENTITIES_TABLE = array();
          foreach (get_html_translation_table(HTML_ENTITIES, ENT_QUOTES, 'UTF-8') as $char => $entity)
          {
              $codePoint = self::DecodeUtf8Sequence((string) $char);
              if ($codePoint !== false) self::$HTML_ENTITIES_TABLE[$entity] = '&#' . $codePoint . ';';
          }

          // NOTE(review): the source this was recovered from had four extra literal keys
          // here that were mangled into identical U+FFFD characters; they mapped to the
          // same values as chr(145)-chr(148) and were presumably those same bytes typed
          // literally, so they are not repeated.
          self::$SMART_QUOTE_CHARS = array(
              // UTF-8 encoded typographic punctuation
              "\xE2\x80\x98" => "'",  // U+2018 left single quote
              "\xE2\x80\x99" => "'",  // U+2019 right single quote
              "\xE2\x80\x9C" => "\"", // U+201C left double quote
              "\xE2\x80\x9D" => "\"", // U+201D right double quote
              "\xE2\x80\x94" => "-",  // U+2014 em dash
              // Windows-1252 single-byte equivalents
              chr(145) => "'",
              chr(146) => "'",
              chr(147) => "\"",
              chr(148) => "\"",
              chr(151) => "-"
          );

          // every C0 control code except the whitespace range 9-13 (tab, LF, VT, FF, CR)
          self::$CONTROL_CODE_CHARS = array();
          for ($code = 0; $code < 32; $code++)
          {
              if ($code >= 9 && $code <= 13) continue;
              self::$CONTROL_CODE_CHARS[chr($code)] = '&#' . $code . ';';
          }

          self::$INVALID_CODE_CHARS = array();
          foreach (self::$CP1252_TO_UNICODE as $byte => $codePoint)
          {
              self::$INVALID_CODE_CHARS[chr($byte)] = '&#' . $codePoint . ';';
          }

          self::$XML_SPECIAL_CHARS = array(
              "&" => "&amp;",
              "<" => "&lt;",
              ">" => "&gt;",
              "\"" => "&quot;",
              "'" => "&apos;"
          );
      }

      /**
       * Takes the given text and converts any email address into mailto links,
       * returning HTML content.
       *
       * The domain part may contain any number of dot-separated labels, so
       * addresses such as user@mail.example.co.uk are linked in full.
       *
       * @param string $text
       * @param bool $sanitize true to sanitize the text before parsing for display security
       * @return string HTML
       */
      public static function ConvertEmailToMailTo($text, $sanitize = false)
      {
          if ($sanitize) $text = self::Sanitize($text);

          $regex = '/([a-z0-9_\-\.\+]+)@((?:[a-z0-9-]{1,64}\.)+[a-z]{2,10})/i';
          return preg_replace($regex, '<a href="mailto:\\1@\\2">\\1@\\2</a>', $text);
      }

      /**
       * Takes the given text and converts any URLs into links,
       * returning HTML content.
       *
       * @param string $text
       * @param bool $sanitize true to sanitize the text before parsing for display security
       * @return string HTML
       */
      public static function ConvertUrlToLink($text, $sanitize = false)
      {
          if ($sanitize) $text = self::Sanitize($text);

          // "~" is the delimiter because the pattern itself contains "//".
          // A double quote may not be part of a URL so unsanitized input cannot break out
          // of the href attribute, and script-capable schemes are never turned into links.
          // The look-behind only stops a match from starting in the middle of a scheme name.
          $regex = '~(?<![[:alpha:]])(?!(?:javascript|vbscript|data):)[[:alpha:]]+://[^<>"[:space:]]+[[:alnum:]/]~i';
          return preg_replace($regex, '<a href="\\0">\\0</a>', $text);
      }

      /**
       * Sanitize any text so that it can be safely displayed as HTML without
       * allowing XSS or other injection attacks. Both quote styles are escaped
       * so the result is also safe inside a quoted attribute value.
       *
       * @param string $text
       * @return string
       */
      public static function Sanitize($text)
      {
          // flags and charset are explicit so the result does not depend on the PHP version
          return htmlspecialchars((string) $text, ENT_QUOTES, self::$DEFAULT_CHARACTER_SET);
      }

      /**
       * Converts every non-ascii character of a UTF-8 string into an html entity.
       *
       * @param string $string
       * @param bool $numericEncodingOnly set to true to only use numeric html encoding. warning, setting to false may be slower performance (default true)
       * @param bool $encodeControlCharacters (only relevant if $numericEncodingOnly = false) false = wipe undecodable bytes. true = encode them (default false)
       * @return string
       */
      public static function EncodeToHTML($string, $numericEncodingOnly = true, $encodeControlCharacters = false)
      {
          $string = (string) $string;
          if ($string === '') return "";

          return $numericEncodingOnly
              ? self::UTF8ToHTML($string)
              : self::UTFToNamedHTML($string, $encodeControlCharacters);
      }

      /**
       * Decode string that has been encoded using EncodeToHTML.
       * Named and numeric entities are both decoded.
       * @TODO: warning, this function is BETA!
       *
       * @param string $string text that is ascii or UTF-8 apart from its entities
       * @param string $charset destination character set (default = $DEFAULT_CHARACTER_SET (UTF-8))
       * @return string
       */
      public static function DecodeFromHTML($string, $charset = null)
      {
          if ($charset == null) $charset = self::$DEFAULT_CHARACTER_SET;

          // html_entity_decode replaces the mbstring 'HTML-ENTITIES' pseudo-encoding, which
          // is deprecated as of PHP 8.2. Decoding always happens in UTF-8 and the result is
          // converted afterwards so any charset that mbstring knows can be requested.
          $decoded = html_entity_decode((string) $string, ENT_QUOTES, 'UTF-8');

          if (strcasecmp($charset, 'UTF-8') !== 0)
          {
              $decoded = mb_convert_encoding($decoded, $charset, 'UTF-8');
          }

          return $decoded;
      }

      /**
       * This HTML encodes special characters and returns an ascii safe version.
       * This function extends EncodeToHTML to additionally strip
       * out characters that may be disruptive when used in HTML or XML data
       *
       * @param string $string value to parse
       * @param bool $escapeQuotes true to additionally escape ENT_QUOTE characters <>&"' (default = true)
       * @param bool $numericEncodingOnly set to true to only use numeric html encoding. warning, setting to false may be slower performance (default true)
       * @param bool $replaceSmartQuotes true to replace "smart quotes" with standard ascii ones, can be useful for stripping out windows-only codes (default = false)
       * @return string
       */
      public static function EncodeSpecialCharacters($string, $escapeQuotes = true, $numericEncodingOnly = true, $replaceSmartQuotes = false)
      {
          $result = (string) $string;
          if ($result === '') return "";

          // do this first before encoding
          if ($replaceSmartQuotes) $result = self::ReplaceSmartQuotes($result);

          // this does not double-encode existing entities; a single quote becomes a numeric entity
          if ($escapeQuotes) $result = htmlspecialchars($result, ENT_QUOTES, self::$DEFAULT_CHARACTER_SET, false);

          return self::EncodeToHTML($result, $numericEncodingOnly);
      }

      /**
       * Converts a string into a character array. Valid UTF-8 is split per
       * character; anything else is split per byte.
       *
       * @param string $string
       * @return array
       */
      public static function GetCharArray($string)
      {
          $chars = preg_split('//u', (string) $string, -1, PREG_SPLIT_NO_EMPTY);
          if (is_array($chars)) return $chars;

          // the "u" modifier rejects malformed UTF-8, so fall back to bytes
          return preg_split('//', (string) $string, -1, PREG_SPLIT_NO_EMPTY);
      }

      /**
       * This replaces XML special characters with HTML encoding
       * @param string $string
       * @return string
       */
      public static function ReplaceXMLSpecialChars($string)
      {
          return strtr($string, self::$XML_SPECIAL_CHARS);
      }

      /**
       * This replaces smart (fancy) quote characters with generic ascii versions.
       *
       * When the input is valid UTF-8 only the multi-byte entries of
       * $SMART_QUOTE_CHARS are applied, because the single high bytes would
       * otherwise match in the middle of unrelated UTF-8 characters and
       * corrupt them.
       *
       * @param string $string
       * @return string
       */
      public static function ReplaceSmartQuotes($string)
      {
          $string = (string) $string;
          $pairs = self::$SMART_QUOTE_CHARS;

          if (mb_check_encoding($string, 'UTF-8'))
          {
              foreach ($pairs as $search => $replacement)
              {
                  $search = (string) $search;
                  if (strlen($search) === 1 && ord($search) >= 128) unset($pairs[$search]);
              }
          }

          return strtr($string, $pairs);
      }

      /**
       * This replaces control characters with their numeric html entities
       * @param string $string
       * @return string
       */
      public static function ReplaceControlCodeChars($string)
      {
          return strtr($string, self::$CONTROL_CODE_CHARS);
      }

      /**
       * This replaces all non-numeric html entities with the numeric equivalent
       * @param string $string
       * @return string
       */
      public static function ReplaceNonNumericEntities($string)
      {
          return strtr($string, self::$HTML_ENTITIES_TABLE);
      }

      /**
       * This replaces the stray Windows-1252 bytes listed in $INVALID_CODE_CHARS
       * with numeric entities. Valid UTF-8 input is returned untouched since
       * those byte values only occur there as part of multi-byte characters.
       *
       * @param string $string
       * @return string
       */
      public static function ReplaceInvalidCodeChars($string)
      {
          $string = (string) $string;
          if (mb_check_encoding($string, 'UTF-8')) return $string;

          return strtr($string, self::$INVALID_CODE_CHARS);
      }

      /**
       * This is the same as UTF8ToHTML except it utilizes htmlentities, which will return the
       * named HTML code when possible for two-byte characters (ie &pound; &sect;, etc). Three and
       * four byte characters are always written as numeric entities. It is preferable in all
       * cases to use UTF8ToHTML instead unless you absolutely have to have named entities
       *
       * Bytes that are not part of a well-formed UTF-8 sequence are either
       * dropped or, when $encodeControlCharacters is true, interpreted as
       * Windows-1252 / Latin-1 and written as an entity. A malformed sequence
       * never consumes the bytes that follow it.
       *
       * @param string $string
       * @param bool $encodeControlCharacters false = wipe undecodable bytes. true = encode them (default false)
       * @return string
       */
      public static function UTFToNamedHTML($string, $encodeControlCharacters = false)
      {
          $utf8 = (string) $string;
          $length = strlen($utf8);
          $result = '';
          $i = 0;

          while ($i < $length)
          {
              $char = $utf8[$i];
              $lead = ord($char);

              if ($lead < 128)
              {
                  // one-byte character
                  $result .= $char;
                  $i++;
                  continue;
              }

              // expected sequence length; 0 = not a valid start byte.
              // 0xF4 (244) is the highest lead byte that can encode a code point <= U+10FFFF
              if ($lead < 192 || $lead > 244) $size = 0;
              elseif ($lead < 224) $size = 2;
              elseif ($lead < 240) $size = 3;
              else $size = 4;

              $sequence = ($size > 0) ? substr($utf8, $i, $size) : '';
              $codePoint = ($size > 0) ? self::DecodeUtf8Sequence($sequence) : false;

              if ($codePoint === false)
              {
                  // stray or truncated byte: skip just this one byte
                  if ($encodeControlCharacters) $result .= self::EncodeStrayByte($char);
                  $i++;
                  continue;
              }

              if ($size === 2)
              {
                  // prefer the named entity, fall back to numeric when there is none
                  $named = htmlentities($sequence, ENT_QUOTES, 'UTF-8');
                  $result .= ($named !== '' && $named[0] === '&') ? $named : '&#' . $codePoint . ';';
              }
              else
              {
                  $result .= '&#' . $codePoint . ';';
              }

              $i += $size;
          }

          return $result;
      }

      /**
       * Converts UTF-8 character set into html encoded goodness
       *
       * @author montana
       * @link http://www.php.net/manual/en/function.htmlentities.php#92105
       * @param string $content
       * @return string
       */
      public static function UTF8ToHTML($content = "")
      {
          $swap = "";
          foreach (self::unicode_string_to_array($content) as $char)
          {
              $swap .= self::unicode_entity_replace($char);
          }

          // normalizes any malformed bytes that were passed through untouched
          return mb_convert_encoding($swap, "UTF-8", "UTF-8");
      }

      /**
       * takes a unicode string and turns it into an array
       * of UTF-8 characters
       *
       * @author adjwilli
       * @param string $string
       * @return array
       */
      public static function unicode_string_to_array($string)
      {
          $string = (string) $string;
          if ($string === '') return array();

          // single pass for well-formed input
          $chars = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
          if (is_array($chars)) return $chars;

          // malformed UTF-8: peel off one character at a time with mbstring
          $array = array();
          $remaining = mb_strlen($string, "UTF-8");
          while ($remaining > 0)
          {
              $array[] = mb_substr($string, 0, 1, "UTF-8");
              $string = mb_substr($string, 1, $remaining, "UTF-8");
              $remaining = mb_strlen($string, "UTF-8");
          }
          return $array;
      }

      /**
       * Uses scary binary math to replace a character with
       * its html entity
       *
       * Ascii, stray continuation bytes and truncated sequences are returned
       * as-is; lead bytes above 0xF4 (which cannot start a valid character)
       * yield an empty string.
       *
       * @author m. perez
       * @param string $c a single UTF-8 character
       * @return string
       */
      public static function unicode_entity_replace($c)
      {
          $c = (string) $c;
          if ($c === '') return $c;

          $h = ord($c[0]);
          if ($h < 0xC2) return $c; // ascii, or not a valid start byte
          if ($h > 0xF4) return '';

          $codePoint = self::DecodeUtf8Sequence($c);
          return ($codePoint === false) ? $c : "&#" . $codePoint . ";";
      }

      /**
       * Used for decoding entities that started as UTF-8
       * converts a character code that is likely non ascii into the correct UTF-8 char value.
       * Codes 128-159 are treated as Windows-1252 and remapped (undefined ones become 160).
       *
       * @link http://www.php.net/manual/en/function.html-entity-decode.php#68491
       * @param int|string $code numeric code point
       * @return string|bool the UTF-8 bytes, or false when $code is not a usable code point
       */
      public static function chr_utf8($code)
      {
          if (!is_numeric($code) || $code < 0 || $code > 0x10FFFF) return false;
          $code = (int) $code;

          if ($code < 128) return chr($code);

          if ($code < 160)
          {
              // Remove Windows illegal chars
              $code = isset(self::$CP1252_TO_UNICODE[$code]) ? self::$CP1252_TO_UNICODE[$code] : 160;
          }

          if ($code < 2048)
          {
              return chr(192 | ($code >> 6)) . chr(128 | ($code & 63));
          }
          if ($code < 65536)
          {
              return chr(224 | ($code >> 12)) . chr(128 | (($code >> 6) & 63)) . chr(128 | ($code & 63));
          }
          return chr(240 | ($code >> 18)) . chr(128 | (($code >> 12) & 63)) . chr(128 | (($code >> 6) & 63)) . chr(128 | ($code & 63));
      }

      /**
       * Callback for preg_replace_callback('~&(#(x?))?([^;]+);~', 'html_entity_replace', $str);
       * used internally by decode
       *
       * Anything that cannot be decoded is returned exactly as it was matched,
       * so unknown text is never silently removed.
       *
       * @link http://www.php.net/manual/en/function.html-entity-decode.php#68491
       * @param array $matches 0 = whole match, 1 = "#" or "#x", 2 = "x", 3 = entity body
       * @return string
       */
      public static function html_entity_replace($matches)
      {
          $whole = isset($matches[0]) ? $matches[0] : '';
          $body = isset($matches[3]) ? (string) $matches[3] : '';
          $char = false;

          if (!empty($matches[2]))
          {
              // hexadecimal numeric entity
              if (preg_match('/^[0-9a-f]+$/i', $body)) $char = self::chr_utf8(hexdec($body));
          }
          elseif (!empty($matches[1]))
          {
              // decimal numeric entity
              if (preg_match('/^[0-9]+$/', $body)) $char = self::chr_utf8($body);
          }
          elseif ($body !== '')
          {
              // named entity
              return html_entity_decode('&' . $body . ';', ENT_QUOTES, self::$DEFAULT_CHARACTER_SET);
          }

          return ($char === false) ? $whole : $char;
      }

      /**
       * Decodes the first UTF-8 sequence in $bytes into its code point.
       * Only the structure is checked (lead byte, length, continuation bytes,
       * upper bound); surrogates and three/four byte overlong forms are not rejected.
       *
       * @param string $bytes
       * @return int|bool code point, or false when the sequence is malformed or truncated
       */
      private static function DecodeUtf8Sequence($bytes)
      {
          $available = strlen($bytes);
          if ($available === 0) return false;

          $lead = ord($bytes[0]);
          if ($lead < 0x80) return $lead;
          if ($lead < 0xC2 || $lead > 0xF4) return false;

          if ($lead < 0xE0) { $size = 2; $codePoint = $lead & 0x1F; }
          elseif ($lead < 0xF0) { $size = 3; $codePoint = $lead & 0x0F; }
          else { $size = 4; $codePoint = $lead & 0x07; }

          if ($available < $size) return false;

          for ($i = 1; $i < $size; $i++)
          {
              $next = ord($bytes[$i]);
              if (($next & 0xC0) !== 0x80) return false;
              $codePoint = ($codePoint << 6) | ($next & 0x3F);
          }

          return ($codePoint > 0x10FFFF) ? false : $codePoint;
      }

      /**
       * Encodes a single byte that is not part of valid UTF-8 by reading it as
       * Windows-1252 (128-159) or Latin-1 (everything else).
       *
       * @param string $byte exactly one byte
       * @return string html entity
       */
      private static function EncodeStrayByte($byte)
      {
          $code = ord($byte);
          if (isset(self::$CP1252_TO_UNICODE[$code])) return '&#' . self::$CP1252_TO_UNICODE[$code] . ';';

          $named = htmlentities($byte, ENT_QUOTES, 'ISO-8859-1');
          return ($named !== '' && $named !== $byte) ? $named : '&#' . $code . ';';
      }
  }

  // this will be executed only once
  VerySimpleStringUtil::InitStaticVars();
  475. ?>