PageRenderTime 49ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/typo3/class.t3lib_cs.php

https://bitbucket.org/ceu/moodle_demo
PHP | 2097 lines | 1238 code | 265 blank | 594 comment | 336 complexity | 34dd01786172e1709351050111fe5d6e MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.0, LGPL-2.1

Large files files are truncated, but you can click here to view the full file

  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2003-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the Typo3 project. The Typo3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. *
  17. * This script is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * This copyright notice MUST APPEAR in all copies of the script!
  23. ***************************************************************/
  24. /**
  25. * Class for conversion between charsets.
  26. *
  27. * Typo Id: class.t3lib_cs.php,v 1.56 2006/05/03 08:47:30 masi Exp $
  28. * Moodle $Id: class.t3lib_cs.php,v 1.7.14.2 2009/11/19 10:10:50 skodak Exp $
  29. *
  30. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  31. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  32. */
  33. /**
  34. * [CLASS/FUNCTION INDEX of SCRIPT]
  35. *
  36. *
  37. *
  38. * 136: class t3lib_cs
  39. * 488: function parse_charset($charset)
  40. * 507: function get_locale_charset($locale)
  41. *
  42. * SECTION: Charset Conversion functions
  43. * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
  44. * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
  45. * 617: function utf8_encode($str,$charset)
  46. * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
  47. * 706: function utf8_to_entities($str)
  48. * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
  49. * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
  50. * 823: function UnumberToChar($cbyte)
  51. * 868: function utf8CharToUnumber($str,$hex=0)
  52. *
  53. * SECTION: Init functions
  54. * 911: function initCharset($charset)
  55. * 973: function initUnicodeData($mode=null)
  56. * 1198: function initCaseFolding($charset)
  57. * 1260: function initToASCII($charset)
  58. *
  59. * SECTION: String operation functions
  60. * 1331: function substr($charset,$string,$start,$len=null)
  61. * 1384: function strlen($charset,$string)
  62. * 1414: function crop($charset,$string,$len,$crop='')
  63. * 1467: function strtrunc($charset,$string,$len)
  64. * 1501: function conv_case($charset,$string,$case)
  65. * 1527: function specCharsToASCII($charset,$string)
  66. *
  67. * SECTION: Internal string operation functions
  68. * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
  69. *
  70. * SECTION: Internal UTF-8 string operation functions
  71. * 1622: function utf8_substr($str,$start,$len=null)
  72. * 1655: function utf8_strlen($str)
  73. * 1676: function utf8_strtrunc($str,$len)
  74. * 1698: function utf8_strpos($haystack,$needle,$offset=0)
  75. * 1723: function utf8_strrpos($haystack,$needle)
  76. * 1745: function utf8_char2byte_pos($str,$pos)
  77. * 1786: function utf8_byte2char_pos($str,$pos)
  78. * 1809: function utf8_char_mapping($str,$mode,$opt='')
  79. *
  80. * SECTION: Internal EUC string operation functions
  81. * 1885: function euc_strtrunc($str,$len,$charset)
  82. * 1914: function euc_substr($str,$start,$charset,$len=null)
  83. * 1939: function euc_strlen($str,$charset)
  84. * 1966: function euc_char2byte_pos($str,$pos,$charset)
  85. * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
  86. *
  87. * TOTAL FUNCTIONS: 35
  88. * (This index is automatically created/updated by the extension "extdeveval")
  89. *
  90. */
  91. /**
  92. * Notes on UTF-8
  93. *
  94. * Functions working on UTF-8 strings:
  95. *
  96. * - strchr/strstr
  97. * - strrchr
  98. * - substr_count
  99. * - implode/explode/join
  100. *
  101. * Functions nearly working on UTF-8 strings:
  102. *
  103. * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  104. * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  105. * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  106. * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
  107. *
  108. * Functions NOT working on UTF-8 strings:
  109. *
  110. * - str*cmp
  111. * - stristr
  112. * - stripos
  113. * - substr
  114. * - strrev
  115. * - ereg/eregi
  116. * - split/spliti
  117. * - preg_*
  118. * - ...
  119. *
  120. */
  121. /**
  122. * Class for conversion between charsets
  123. *
  124. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  125. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  126. * @package TYPO3
  127. * @subpackage t3lib
  128. */
  129. class t3lib_cs {
  130. var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
  131. // This is the array where parsed conversion tables are stored (cached)
  132. var $parsedCharsets=array();
  133. // An array where case folding data will be stored (cached)
  134. var $caseFolding=array();
  135. // An array where charset-to-ASCII mappings are stored (cached)
  136. var $toASCII=array();
  137. // This tells the converter which charsets has two bytes per char:
  138. var $twoByteSets=array(
  139. 'ucs-2'=>1, // 2-byte Unicode
  140. );
  141. // This tells the converter which charsets has four bytes per char:
  142. var $fourByteSets=array(
  143. 'ucs-4'=>1, // 4-byte Unicode
  144. 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  145. );
  146. // This tells the converter which charsets use a scheme like the Extended Unix Code:
  147. var $eucBasedSets=array(
  148. 'gb2312'=>1, // Chinese, simplified.
  149. 'big5'=>1, // Chinese, traditional.
  150. 'euc-kr'=>1, // Korean
  151. 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
  152. );
  153. // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
  154. // http://czyborra.com/charsets/iso8859.html
  155. var $synonyms=array(
  156. 'us' => 'ascii',
  157. 'us-ascii'=> 'ascii',
  158. 'cp819' => 'iso-8859-1',
  159. 'ibm819' => 'iso-8859-1',
  160. 'iso-ir-100' => 'iso-8859-1',
  161. 'iso-ir-109' => 'iso-8859-2',
  162. 'iso-ir-148' => 'iso-8859-9',
  163. 'iso-ir-199' => 'iso-8859-14',
  164. 'iso-ir-203' => 'iso-8859-15',
  165. 'csisolatin1' => 'iso-8859-1',
  166. 'csisolatin2' => 'iso-8859-2',
  167. 'csisolatin3' => 'iso-8859-3',
  168. 'csisolatin5' => 'iso-8859-9',
  169. 'csisolatin8' => 'iso-8859-14',
  170. 'csisolatin9' => 'iso-8859-15',
  171. 'csisolatingreek' => 'iso-8859-7',
  172. 'iso-celtic' => 'iso-8859-14',
  173. 'latin1' => 'iso-8859-1',
  174. 'latin2' => 'iso-8859-2',
  175. 'latin3' => 'iso-8859-3',
  176. 'latin5' => 'iso-8859-9',
  177. 'latin6' => 'iso-8859-10',
  178. 'latin8' => 'iso-8859-14',
  179. 'latin9' => 'iso-8859-15',
  180. 'l1' => 'iso-8859-1',
  181. 'l2' => 'iso-8859-2',
  182. 'l3' => 'iso-8859-3',
  183. 'l5' => 'iso-8859-9',
  184. 'l6' => 'iso-8859-10',
  185. 'l8' => 'iso-8859-14',
  186. 'l9' => 'iso-8859-15',
  187. 'cyrillic' => 'iso-8859-5',
  188. 'arabic' => 'iso-8859-6',
  189. 'tis-620' => 'iso-8859-11',
  190. 'win874' => 'windows-874',
  191. 'win1250' => 'windows-1250',
  192. 'win1251' => 'windows-1251',
  193. 'win1252' => 'windows-1252',
  194. 'win1253' => 'windows-1253',
  195. 'win1254' => 'windows-1254',
  196. 'win1255' => 'windows-1255',
  197. 'win1256' => 'windows-1256',
  198. 'win1257' => 'windows-1257',
  199. 'win1258' => 'windows-1258',
  200. 'cp1250' => 'windows-1250',
  201. 'cp1251' => 'windows-1251',
  202. 'cp1252' => 'windows-1252',
  203. 'ms-ee' => 'windows-1250',
  204. 'ms-ansi' => 'windows-1252',
  205. 'ms-greek' => 'windows-1253',
  206. 'ms-turk' => 'windows-1254',
  207. 'winbaltrim' => 'windows-1257',
  208. 'koi-8ru' => 'koi-8r',
  209. 'koi8r' => 'koi-8r',
  210. 'cp878' => 'koi-8r',
  211. 'mac' => 'macroman',
  212. 'macintosh' => 'macroman',
  213. 'euc-cn' => 'gb2312',
  214. 'x-euc-cn' => 'gb2312',
  215. 'euccn' => 'gb2312',
  216. 'cp936' => 'gb2312',
  217. 'big-5' => 'big5',
  218. 'cp950' => 'big5',
  219. 'eucjp' => 'euc-jp',
  220. 'sjis' => 'shift_jis',
  221. 'shift-jis' => 'shift_jis',
  222. 'cp932' => 'shift_jis',
  223. 'cp949' => 'euc-kr',
  224. 'utf7' => 'utf-7',
  225. 'utf8' => 'utf-8',
  226. 'utf16' => 'utf-16',
  227. 'utf32' => 'utf-32',
  228. 'utf8' => 'utf-8',
  229. 'ucs2' => 'ucs-2',
  230. 'ucs4' => 'ucs-4',
  231. );
  232. // mapping of iso-639:2 language codes to script names
  // Maps a language identifier (as found in the language part of a locale
  // string) to the name of the script/charset family used for that language.
  // The resulting script name is the key used in $script_to_charset_unix and
  // $script_to_charset_windows.
  var $lang_to_script=array(
      // Two-letter ISO 639 language codes, see:
      // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
      // http://www.loc.gov/standards/iso639-2/langcodes.html
      // http://www.unicode.org/onlinedat/languages.html
      'ar' => 'arabic',
      'bg' => 'cyrillic', // Bulgarian
      'bs' => 'east_european', // Bosnian
      'cs' => 'east_european', // Czech
      'da' => 'west_european', // Danish
      'de' => 'west_european', // German
      'es' => 'west_european', // Spanish
      'et' => 'estonian',
      'eo' => 'unicode', // Esperanto
      'eu' => 'west_european', // Basque
      'fa' => 'arabic', // Persian
      'fi' => 'west_european', // Finnish
      'fo' => 'west_european', // Faroese
      'fr' => 'west_european', // French
      'el' => 'greek', // Greek (ISO code)
      'gr' => 'greek', // Greek (non-ISO code, kept for backward compatibility)
      'he' => 'hebrew', // Hebrew (since 1998)
      'hi' => 'unicode', // Hindi
      'hr' => 'east_european', // Croatian
      'hu' => 'east_european', // Hungarian
      'iw' => 'hebrew', // Hebrew (til 1998)
      'is' => 'west_european', // Icelandic
      'it' => 'west_european', // Italian
      'ja' => 'japanese',
      'kl' => 'west_european', // Greenlandic
      'ko' => 'korean',
      'lt' => 'lithuanian',
      'lv' => 'west_european', // Latvian/Lettish
      'nl' => 'west_european', // Dutch
      'no' => 'west_european', // Norwegian
      'pl' => 'east_european', // Polish
      'pt' => 'west_european', // Portuguese
      'ro' => 'east_european', // Romanian
      'ru' => 'cyrillic', // Russian
      'sk' => 'east_european', // Slovak
      'sl' => 'east_european', // Slovenian
      'sr' => 'cyrillic', // Serbian
      'sv' => 'west_european', // Swedish
      'th' => 'thai',
      'tr' => 'turkish', // Turkish
      'uk' => 'cyrillic', // Ukrainian
      'vi' => 'vietnamese',
      'zh' => 'chinese',
      // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
      // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
      'ara' => 'arabic',
      'bgr' => 'cyrillic', // Bulgarian
      'cat' => 'west_european', // Catalan
      'chs' => 'simpl_chinese',
      'cht' => 'trad_chinese',
      'csy' => 'east_european', // Czech
      'dan' => 'west_european', // Danish
      'deu' => 'west_european', // German
      'dea' => 'west_european', // German (Austrian)
      'des' => 'west_european', // German (Swiss)
      'ena' => 'west_european', // English (Australian)
      'enc' => 'west_european', // English (Canadian)
      'eng' => 'west_european', // English
      'enz' => 'west_european', // English (New Zealand)
      'enu' => 'west_european', // English (United States)
      'euq' => 'west_european', // Basque
      'fos' => 'west_european', // Faroese
      'far' => 'arabic', // Persian
      'fin' => 'west_european', // Finnish
      'fra' => 'west_european', // French
      'frb' => 'west_european', // French (Belgian)
      'frc' => 'west_european', // French (Canadian)
      'frs' => 'west_european', // French (Swiss)
      'ell' => 'greek',
      'heb' => 'hebrew',
      'hin' => 'unicode', // Hindi
      'hun' => 'east_european', // Hungarian
      'isl' => 'west_european', // Icelandic
      'ita' => 'west_european', // Italian
      'its' => 'west_european', // Italian (Swiss)
      'jpn' => 'japanese',
      'kor' => 'korean',
      'lth' => 'lithuanian',
      'lvi' => 'west_european', // Latvian/Lettish
      'msl' => 'west_european', // Malay
      'nlb' => 'west_european', // Dutch (Belgian)
      'nld' => 'west_european', // Dutch
      'nor' => 'west_european', // Norwegian (bokmal)
      'non' => 'west_european', // Norwegian (nynorsk)
      'plk' => 'east_european', // Polish
      'ptg' => 'west_european', // Portuguese
      'ptb' => 'west_european', // Portuguese (Brazil)
      'rom' => 'east_european', // Romanian
      'rus' => 'cyrillic', // Russian
      'slv' => 'east_european', // Slovenian
      'sky' => 'east_european', // Slovak
      'srl' => 'east_european', // Serbian (Latin)
      'srb' => 'cyrillic', // Serbian (Cyrillic)
      'esp' => 'west_european', // Spanish (trad. sort)
      'esm' => 'west_european', // Spanish (Mexican)
      'esn' => 'west_european', // Spanish (internat. sort)
      'sve' => 'west_european', // Swedish
      'tha' => 'thai',
      'trk' => 'turkish',
      'ukr' => 'cyrillic', // Ukrainian
      // English language names
      'arabic' => 'arabic',
      'basque' => 'west_european',
      'bosnian' => 'east_european',
      'bulgarian' => 'cyrillic', // same script as the 'bg' / 'bgr' codes above
      'catalan' => 'west_european',
      'croatian' => 'east_european',
      'czech' => 'east_european',
      'danish' => 'west_european',
      'dutch' => 'west_european',
      'english' => 'west_european',
      'esperanto' => 'unicode',
      'estonian' => 'estonian',
      'faroese' => 'west_european',
      'farsi' => 'arabic',
      'finnish' => 'west_european',
      'french' => 'west_european',
      'galician' => 'west_european',
      'german' => 'west_european',
      'greek' => 'greek',
      'greenlandic' => 'west_european',
      'hebrew' => 'hebrew',
      'hindi' => 'unicode',
      'hungarian' => 'east_european',
      'icelandic' => 'west_european',
      'italian' => 'west_european',
      'latvian' => 'west_european',
      'lettish' => 'west_european',
      'lithuanian' => 'lithuanian',
      'malay' => 'west_european',
      'norwegian' => 'west_european',
      'persian' => 'arabic',
      'polish' => 'east_european',
      'portuguese' => 'west_european',
      'russian' => 'cyrillic',
      'romanian' => 'east_european',
      'serbian' => 'cyrillic',
      'slovak' => 'east_european',
      'slovenian' => 'east_european',
      'spanish' => 'west_european',
      'swedish' => 'west_european',
      'svedish' => 'west_european', // historic misspelling of 'swedish', kept for backward compatibility
      'thai' => 'thai',
      'that' => 'thai', // historic misspelling of 'thai', kept for backward compatibility
      'turkish' => 'turkish',
      'ukrainian' => 'cyrillic',
  );
  381. // mapping of language (family) names to charsets on Unix
  382. var $script_to_charset_unix=array(
  383. 'west_european' => 'iso-8859-1',
  384. 'estonian' => 'iso-8859-1',
  385. 'east_european' => 'iso-8859-2',
  386. 'baltic' => 'iso-8859-4',
  387. 'cyrillic' => 'iso-8859-5',
  388. 'arabic' => 'iso-8859-6',
  389. 'greek' => 'iso-8859-7',
  390. 'hebrew' => 'iso-8859-8',
  391. 'turkish' => 'iso-8859-9',
  392. 'thai' => 'iso-8859-11', // = TIS-620
  393. 'lithuanian' => 'iso-8859-13',
  394. 'chinese' => 'gb2312', // = euc-cn
  395. 'japanese' => 'euc-jp',
  396. 'korean' => 'euc-kr',
  397. 'simpl_chinese' => 'gb2312',
  398. 'trad_chinese' => 'big5',
  399. 'vietnamese' => '',
  400. 'unicode' => 'utf-8',
  401. );
  402. // mapping of language (family) names to charsets on Windows
  403. var $script_to_charset_windows=array(
  404. 'east_european' => 'windows-1250',
  405. 'cyrillic' => 'windows-1251',
  406. 'west_european' => 'windows-1252',
  407. 'greek' => 'windows-1253',
  408. 'turkish' => 'windows-1254',
  409. 'hebrew' => 'windows-1255',
  410. 'arabic' => 'windows-1256',
  411. 'baltic' => 'windows-1257',
  412. 'estonian' => 'windows-1257',
  413. 'lithuanian' => 'windows-1257',
  414. 'vietnamese' => 'windows-1258',
  415. 'thai' => 'cp874',
  416. 'korean' => 'cp949',
  417. 'chinese' => 'gb2312',
  418. 'japanese' => 'shift_jis',
  419. 'simpl_chinese' => 'gb2312',
  420. 'trad_chinese' => 'big5',
  421. );
  422. // mapping of locale names to charsets
  // Exact locale names that map directly to a charset.
  // get_locale_charset() lowercases the locale before looking it up here,
  // so only all-lowercase keys can ever match through that method.
  var $locale_to_charset=array(
      'japanese.euc' => 'euc-jp',
      'ja_jp.ujis' => 'euc-jp',
      'korean.euc' => 'euc-kr',
      'sr@latn' => 'iso-8859-2',
      'sr@Latn' => 'iso-8859-2', // mixed-case key kept for code that reads this array directly
      'zh_cn' => 'gb2312',
      'zh_hk' => 'big5',
      'zh_tw' => 'big5',
  );
  432. // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
  433. // Empty values means "iso-8859-1"
  434. var $charSetArray = array(
  435. 'dk' => '',
  436. 'de' => '',
  437. 'no' => '',
  438. 'it' => '',
  439. 'fr' => '',
  440. 'es' => '',
  441. 'nl' => '',
  442. 'cz' => 'windows-1250',
  443. 'pl' => 'iso-8859-2',
  444. 'si' => 'windows-1250',
  445. 'fi' => '',
  446. 'tr' => 'iso-8859-9',
  447. 'se' => '',
  448. 'pt' => '',
  449. 'ru' => 'windows-1251',
  450. 'ro' => 'iso-8859-2',
  451. 'ch' => 'gb2312',
  452. 'sk' => 'windows-1250',
  453. 'lt' => 'windows-1257',
  454. 'is' => 'utf-8',
  455. 'hr' => 'windows-1250',
  456. 'hu' => 'iso-8859-2',
  457. 'gl' => '',
  458. 'th' => 'iso-8859-11',
  459. 'gr' => 'iso-8859-7',
  460. 'hk' => 'big5',
  461. 'eu' => '',
  462. 'bg' => 'windows-1251',
  463. 'br' => '',
  464. 'et' => 'iso-8859-4',
  465. 'ar' => 'iso-8859-6',
  466. 'he' => 'utf-8',
  467. 'ua' => 'windows-1251',
  468. 'jp' => 'shift_jis',
  469. 'lv' => 'utf-8',
  470. 'vn' => 'utf-8',
  471. 'ca' => 'iso-8859-15',
  472. 'ba' => 'iso-8859-2',
  473. 'kr' => 'euc-kr',
  474. 'eo' => 'utf-8',
  475. 'my' => '',
  476. 'hi' => 'utf-8',
  477. 'fo' => 'utf-8',
  478. 'fa' => 'utf-8',
  479. 'sr' => 'utf-8'
  480. );
  481. // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
  482. // Missing keys means: same as Typo3
  483. var $isoArray = array(
  484. 'ba' => 'bs',
  485. 'br' => 'pt_BR',
  486. 'ch' => 'zh_CN',
  487. 'cz' => 'cs',
  488. 'dk' => 'da',
  489. 'si' => 'sl',
  490. 'se' => 'sv',
  491. 'gl' => 'kl',
  492. 'gr' => 'el',
  493. 'hk' => 'zh_HK',
  494. 'kr' => 'ko',
  495. 'ua' => 'uk',
  496. 'jp' => 'ja',
  497. 'vn' => 'vi',
  498. );
  499. /**
  500. * Normalize a charset name: convert it to lowercase and resolve known synonyms to the canonical name.
  501. *
  502. * @param string Input charset
  503. * @return string Normalized charset
  504. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  505. */
  function parse_charset($charset) {
      // Charset names are compared in lower case throughout this class.
      $normalized = strtolower($charset);

      // A known alias resolves to its canonical name; anything else is
      // returned as the lowercased input.
      return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
  }
  511. /**
  512. * Get the charset of a locale.
  513. *
  514. * ln language
  515. * ln_CN language / country
  516. * ln_CN.cs language / country / charset
  517. * ln_CN.cs@mod language / country / charset / modifier
  518. *
  519. * @param string Locale string
  520. * @return string Charset resolved for locale string
  521. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  522. */
  function get_locale_charset($locale) {
      $locale = strtolower($locale);

      // An exact entry in the locale table wins over every heuristic below.
      if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

      // Split off the "@modifier" part. array_pad() guarantees both list()
      // targets exist even when the separator is absent.
      list($locale,$modifier) = array_pad(explode('@',$locale), 2, '');

      // Split off the ".charset" part; an explicit charset is used as-is
      // (after synonym resolution).
      list($locale,$charset) = array_pad(explode('.',$locale), 2, '');
      if ($charset) return $this->parse_charset($charset);

      // Modifier 'euro' is checked after the charset (because of xx.utf-8@euro).
      if ($modifier == 'euro') return 'iso-8859-15';

      // The language is whatever precedes the first underscore; the country
      // part is not used.
      list($language) = explode('_',$locale);
      $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

      // Choose the platform table and its default. The defined() guard keeps
      // an undefined TYPO3_OS on the non-Windows branch instead of failing.
      if (defined('TYPO3_OS') && TYPO3_OS == 'WIN') {
          $scriptMap = $this->script_to_charset_windows;
          $default = 'windows-1252';
      } else {
          $scriptMap = $this->script_to_charset_unix;
          $default = 'iso-8859-1';
      }

      // Unknown scripts and scripts mapped to an empty charset fall back to
      // the platform default.
      return !empty($scriptMap[$script]) ? $scriptMap[$script] : $default;
  }
  544. /********************************************
  545. *
  546. * Charset Conversion functions
  547. *
  548. ********************************************/
  549. /**
  550. * Convert from one charset to another charset.
  551. *
  552. * @param string Input string
  553. * @param string From charset (the current charset of the string)
  554. * @param string To charset (the output charset wanted)
  555. * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
  556. * @return string Converted string
  557. * @see convArray()
  558. */
  function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
      // Nothing to do when source and target charset are the same.
      if ($fromCS==$toCS) return $str;

      // PHP-libs don't support fallback to SGML entities, but UTF-8 handles
      // everything, so the native converters are only tried when the target
      // is UTF-8 or when entity fallback was not requested.
      if ($toCS=='utf-8' || !$useEntityForNoChar) {
          // The conversion method is optional configuration; treat a missing
          // setting as "no native converter configured".
          $method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
              ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
              : '';

          switch($method) {
              case 'mbstring':
                  if (function_exists('mb_convert_encoding')) {
                      $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
                      if (false !== $conv_str) return $conv_str; // false means the conversion failed
                  }
                  break;
              case 'iconv':
                  if (function_exists('iconv')) {
                      $conv_str = iconv($fromCS,$toCS.'//IGNORE',$str);
                      if (false !== $conv_str) return $conv_str;
                  }
                  break;
              case 'recode':
                  if (function_exists('recode_string')) {
                      $conv_str = recode_string($fromCS.'..'.$toCS,$str);
                      if (false !== $conv_str) return $conv_str;
                  }
                  break;
          }
          // No native converter succeeded: fall back to the table-based
          // conversion below.
      }

      // Table-based conversion always goes through UTF-8 as the pivot.
      if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
      if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
      return $str;
  }
  583. /**
  584. * Convert all elements in ARRAY from one charset to another charset.
  585. * NOTICE: Array is passed by reference!
  586. *
  587. * @param array Input array, possibly multidimensional
  588. * @param string From charset (the current charset of the string)
  589. * @param string To charset (the output charset wanted)
  590. * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
  591. * @return void
  592. * @see conv()
  593. */
  function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
      // Walk the keys and write back through $array[$idx] so the caller's
      // array (passed by reference) is modified in place.
      foreach ($array as $idx => $unused) {
          if (!is_array($array[$idx])) {
              // Leaf value: convert it.
              $array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
              continue;
          }
          // Nested array: recurse, again by reference.
          $this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
      }
  }
  603. /**
  604. * Converts $str from $charset to UTF-8
  605. *
  606. * @param string String in local charset to convert to UTF-8
  607. * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
  608. * @return string Output string, converted to UTF-8
  609. */
  function utf8_encode($str,$charset) {
      // UTF-8 input needs no conversion.
      if ($charset === 'utf-8') return $str;

      // Parse the conversion table if not already done. When the charset
      // cannot be initialised the function falls through and returns nothing
      // (NULL).
      if ($this->initCharset($charset)) {
          $strLen = strlen($str);
          $outStr = '';

          // These lookups do not change while the string is traversed.
          $isTwoByteSet = isset($this->twoByteSets[$charset]);
          $isEucBasedSet = isset($this->eucBasedSets[$charset]);

          for ($a=0;$a<$strLen;$a++) { // Traverse each byte in the string.
              $chr = substr($str,$a,1);
              $ord = ord($chr);

              if ($isTwoByteSet) { // The charset has two bytes per char
                  // substr() yields an empty string past the end of the input,
                  // so a trailing odd byte is paired with 0 instead of reading
                  // an invalid string offset.
                  $ord2 = ord(substr($str,$a+1,1));
                  $ord = $ord<<8 | $ord2; // assume big endian

                  if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char number found in the parsed table
                      $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
                  } else $outStr.= chr($this->noCharByteVal); // No char exists: emit the placeholder byte
                  $a++;
              } elseif ($ord>127) { // Values above 127 are not plain ASCII and need a table lookup
                  if ($isEucBasedSet) { // EUC uses two bytes above 127; fetch both, advance the pointer and make $ord a 16-bit int.
                      if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: bytes between 160 and 223 are single-byte chars
                          $a++;
                          $ord2 = ord(substr($str,$a,1));
                          $ord = $ord*256+$ord2;
                      }
                  }

                  if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char number found in the parsed table
                      $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
                  } else $outStr.= chr($this->noCharByteVal); // No char exists: emit the placeholder byte
              } else $outStr.= $chr; // ASCII 0-127 is a single byte and passes through unchanged
          }
          return $outStr;
      }
  }
  642. /**
  643. * Converts $str from UTF-8 to $charset
  644. *
  645. * @param string String in UTF-8 to convert to local charset
  646. * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
  647. * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
  648. * @return string Output string, converted to local charset
  649. */
  function utf8_decode($str,$charset,$useEntityForNoChar=0) {
      // Parse the conversion table if not already done; without a usable
      // table nothing is returned (NULL).
      if (!$this->initCharset($charset)) {
          return;
      }

      $total = strlen($str);
      $result = '';

      for ($pos = 0; $pos < $total; $pos++) { // Walk the UTF-8 string byte by byte.
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // ASCII 0-127 is a single byte and passes through unchanged.
          if ($code <= 127) {
              $result .= $byte;
              continue;
          }

          // A lead byte must have bit 6 set; otherwise we are in the middle
          // of a byte sequence and emit the placeholder.
          if (!($code & 64)) {
              $result .= chr($this->noCharByteVal);
              continue;
          }

          // Every consecutive set bit from bit 6 downwards announces one
          // more byte belonging to this sequence.
          $sequence = $byte;
          for ($bit = 64; $bit > 0 && ($code & $bit); $bit >>= 1) {
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }

          if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
              // The sequence is known: emit its local char number.
              $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
              if ($localNumber > 255) {
                  // Numbers above 255 are split into two bytes (16-bit word assumed).
                  $result .= chr(($localNumber >> 8) & 255).chr($localNumber & 255);
              } else {
                  $result .= chr($localNumber);
              }
          } elseif ($useEntityForNoChar) {
              // Unknown in the target charset: create a numeric entity.
              $result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
          } else {
              // Unknown in the target charset: emit the placeholder byte.
              $result .= chr($this->noCharByteVal);
          }
      }

      return $result;
  }
  683. /**
  684. * Converts all chars > 127 to numeric entities.
  685. *
  686. * @param string Input string
  687. * @return string Output string
  688. */
  function utf8_to_entities($str) {
      $total = strlen($str);
      $result = '';

      for ($pos = 0; $pos < $total; $pos++) { // Walk the UTF-8 string byte by byte.
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // ASCII 0-127 is a single byte and passes through unchanged.
          if ($code <= 127) {
              $result .= $byte;
              continue;
          }

          // A lead byte must have bit 6 set; otherwise we are in the middle
          // of a byte sequence and emit the placeholder.
          if (!($code & 64)) {
              $result .= chr($this->noCharByteVal);
              continue;
          }

          // Every consecutive set bit from bit 6 downwards announces one
          // more byte belonging to this sequence.
          $sequence = $byte;
          for ($bit = 64; $bit > 0 && ($code & $bit); $bit >>= 1) {
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }

          // Replace the whole multibyte sequence by a numeric entity.
          $result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
      }

      return $result;
  }
  712. /**
  713. * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
  714. *
  715. * @param string Input string, UTF-8
  716. * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
  717. * @return string Output string
  718. */
  function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
      // Lookup table "&name;" => ISO-8859-1 character, only needed when named
      // entities are to be converted as well.
      $trans_tbl = array();
      if ($alsoStdHtmlEnt) {
          // The table must be in ISO-8859-1 because its values are run
          // through utf8_encode(...,'iso-8859-1') below. The encoding
          // argument exists since PHP 5.3.4; older versions only know the
          // one-argument form.
          if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
              $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
          } else {
              $table = get_html_translation_table(HTML_ENTITIES);
          }
          $trans_tbl = array_flip($table);
      }

      // Split on "&...;" while keeping the captured entity body. The result
      // alternates: even indexes are literal text, odd indexes are entity
      // bodies (without "&" and ";").
      $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
      if ($parts === false) return $str; // regex engine failure: leave the input untouched

      foreach($parts as $k => $v) {
          if ($k%2) {
              $entity = '&'.$v.';';
              if (substr($v,0,1)=='#') { // Dec or hex entities:
                  $number = substr($v,1);
                  if (preg_match('/^x[0-9a-f]+$/i', $number)) {
                      // Hexadecimal, "x" or "X" prefix.
                      $parts[$k] = $this->UnumberToChar(hexdec(substr($number,1)));
                  } elseif (preg_match('/^[0-9]+$/', $number)) {
                      // Decimal.
                      $parts[$k] = $this->UnumberToChar(intval($number));
                  } else {
                      // Malformed numeric entity (e.g. "&#;" or "&#zz;"): keep it verbatim.
                      $parts[$k] = $entity;
                  }
              } elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity])) { // Named entities:
                  $parts[$k] = $this->utf8_encode($trans_tbl[$entity],'iso-8859-1');
              } else { // No conversion:
                  $parts[$k] = $entity;
              }
          }
      }
      return implode('',$parts);
  }
  742. /**
  743. * Converts all chars in the input UTF-8 string into integer numbers returned in an array
  744. *
  745. * @param string Input string, UTF-8
  746. * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
  747. * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
  748. * @return array Output array with the char numbers
  749. */
  function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
      // When entities count as characters, turn them into real UTF-8 first.
      if ($convEntities) {
          $str = $this->entities_to_utf8($str,1);
      }

      $total = strlen($str);
      $chars = array();

      for ($pos = 0; $pos < $total; $pos++) { // Walk the UTF-8 string byte by byte.
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // ASCII 0-127 is a single byte: store it as char or as number.
          if ($code <= 127) {
              $chars[] = $retChar ? chr($code) : $code;
              continue;
          }

          // A lead byte must have bit 6 set; otherwise we are in the middle
          // of a byte sequence and store the placeholder.
          if (!($code & 64)) {
              $chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
              continue;
          }

          // Every consecutive set bit from bit 6 downwards announces one
          // more byte belonging to this sequence.
          $sequence = $byte;
          for ($bit = 64; $bit > 0 && ($code & $bit); $bit >>= 1) {
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }

          // Store the multibyte char itself or its number, as requested.
          $chars[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
      }

      return $chars;
  }
  778. /**
  779. * Converts a UNICODE number to a UTF-8 multibyte character
  780. * Algorithm based on a script found at http://czyborra.com/utf/
  781. * Unit-tested by Kasper
  782. *
  783. * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
  784. *
  785. * bytes | bits | representation
  786. * 1 | 7 | 0vvvvvvv
  787. * 2 | 11 | 110vvvvv 10vvvvvv
  788. * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
  789. * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  790. * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  791. * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  792. *
  793. * @param integer UNICODE integer
  794. * @return string UTF-8 multibyte character string
  795. * @see utf8CharToUnumber()
  796. */
  function UnumberToChar($cbyte) {
      // 7-bit values are a single byte identical to ASCII.
      if ($cbyte < 0x80) {
          return chr($cbyte);
      }

      // Pick the lead-byte marker and the number of continuation bytes
      // from the range the number falls into.
      if ($cbyte < 0x800) {
          $marker = 0xC0;
          $tail = 1;
      } elseif ($cbyte < 0x10000) {
          $marker = 0xE0;
          $tail = 2;
      } elseif ($cbyte < 0x200000) {
          $marker = 0xF0;
          $tail = 3;
      } elseif ($cbyte < 0x4000000) {
          $marker = 0xF8;
          $tail = 4;
      } elseif ($cbyte < 0x80000000) {
          $marker = 0xFC;
          $tail = 5;
      } else {
          // Cannot express a 32-bit character in UTF-8
          return chr($this->noCharByteVal);
      }

      // Lead byte: marker bits plus the bits above all continuation payloads.
      $out = chr($marker | ($cbyte >> (6 * $tail)));

      // Continuation bytes: 10vvvvvv, six payload bits each, highest group first.
      for ($shift = 6 * ($tail - 1); $shift >= 0; $shift -= 6) {
          $out .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
      }

      return $out;
  }
  831. /**
  832. * Converts a UTF-8 Multibyte character to a UNICODE number
  833. * Unit-tested by Kasper
  834. *
  835. * @param string UTF-8 multibyte character string
  836. * @param boolean If set, then a hex. number is returned.
  837. * @return integer UNICODE integer
  838. * @see UnumberToChar()
  839. */
  function utf8CharToUnumber($str,$hex=0) {
      $lead = ord(substr($str,0,1));

      if (($lead & 192) != 192) {
          // Top two bits are not both set: not a multibyte lead byte,
          // the byte value itself is the number.
          $number = $lead;
      } else {
          $shifted = $lead;
          $tailBits = '';
          $extra = 0; // number of continuation bytes consumed so far

          // Each further leading 1-bit of the first byte announces one more byte;
          // collect the low six bits of every continuation byte as a binary string.
          while ($extra < 8) {
              $shifted = $shifted << 1;
              if (!($shifted & 128)) {
                  break;
              }
              $tailBits .= sprintf('%06b', ord(substr($str,$extra+1,1)) & 63);
              $extra++;
          }

          // The lead byte contributes its lowest (6 - $extra) bits.
          $leadBits = substr('00000000'.decbin($lead), -(6-$extra));
          $number = bindec($leadBits.$tailBits);
      }

      return $hex ? 'x'.dechex($number) : $number;
  }
  855. /********************************************
  856. *
  857. * Init functions
  858. *
  859. ********************************************/
  860. /**
  861. * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
  862. * This function is automatically called by the conversion functions
  863. *
  864. * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
  865. *
  866. * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
  867. * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
  868. * @access private
  869. */
  function initCharset($charset) {
      // Nothing to do if the table was already loaded during this request.
      // isset() first, so an unknown charset does not raise an undefined-index notice.
      if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
          return 1;
      }

      // Conversion table filename:
      $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

      // Bail out if there is no conversion table for this charset:
      if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
          return false;
      }

      // Cache file for charsets:
      // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          // NOTE(review): unserialize() on a file from the temp directory - safe only as long
          // as that directory is writable by this application alone; confirm for your setup.
          $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // The "whitespaced" table syntax is "0x0A 0x000A #LINE FEED".
      // PCRE replaces the POSIX ereg()/split() functions, which were removed in PHP 7.0.
      $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

      // Parse conversion table into lines:
      $lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);

      // Initialize the internal variable holding the conv. table:
      $this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

      $detectedType = '';
      foreach ($lines as $value) {
          // Comment lines and blanks are ignored.
          if (!trim($value) || substr($value,0,1)=='#') {
              continue;
          }

          // Detect type if not done yet: (Done on first real line)
          // "whitespaced" is described above, while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
          if (!$detectedType) {
              $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
          }

          if ($detectedType=='ms-token') {
              // Pad the result so a malformed line yields empty fields instead of notices.
              list($hexbyte,$utf8) = array_pad(preg_split('/=|:/',$value,3),2,'');
          } else {
              $regA = array();
              if (!preg_match($whitespacedPattern,$value,$regA)) {
                  // A line that does not fit the detected syntax carries no mapping.
                  continue;
              }
              $hexbyte = $regA[1];
              $utf8 = 'U+'.$regA[2];
          }

          // Only values above 127 are stored; 0-127 is treated as plain ASCII.
          $decval = hexdec(trim($hexbyte));
          if ($decval>127) {
              $utf8decval = hexdec(substr(trim($utf8),2));
              $utf8Char = $this->UnumberToChar($utf8decval);
              $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
              $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
          }
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
      }

      return 2;
  }
  918. /**
  919. * This function initializes all UTF-8 character data tables.
  920. *
  921. * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
  922. *
  923. * @param string Mode ("case", "ascii", ...)
  924. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  925. * @access private
  926. */
  function initUnicodeData($mode=null) {
      // cache files
      $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
      $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

      // Only process if the tables are not yet loaded.
      // NOTE(review): the cached tables are read with unserialize(); this is safe only as long as
      // the temp directory is writable by this application alone - confirm for your setup.
      switch($mode) {
          case 'case':
              if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;
              // Use cached version if possible
              if ($cacheFileCase && @is_file($cacheFileCase)) {
                  $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                  return 2;
              }
              break;
          case 'ascii':
              if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;
              // Use cached version if possible
              if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                  $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                  return 2;
              }
              break;
      }

      // process main Unicode data file
      $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
      if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
      $fh = fopen($unicodeDataFile,'rb');
      if (!$fh) return false;

      // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
      // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
      $this->caseFolding['utf-8'] = array();
      $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
      $utf8CaseFolding['toUpper'] = array();
      $utf8CaseFolding['toLower'] = array();
      $utf8CaseFolding['toTitle'] = array();

      $decomposition = array(); // array of temp. decompositions
      $mark = array();          // array of chars that are marks (eg. composing accents)
      $number = array();        // array of chars that are numbers (eg. digits)
      $omit = array();          // array of chars to be omitted (eg. Russian hard sign)

      // Read until fgets() reports EOF; testing feof() before reading would process
      // the FALSE returned by the final read as if it were a data line.
      while (($line = fgets($fh,4096)) !== false) {
          $line = rtrim($line);
          if ($line === '') continue;

          // A row has a lot of info; only some of the semicolon separated columns are used.
          // Padding guards against short rows.
          $fields = array_pad(explode(';',$line),15,'');
          $char   = $fields[0];   // code point (hex)
          $name   = $fields[1];   // character name
          $cat    = $fields[2];   // general category
          $decomp = $fields[5];   // decomposition mapping
          $num    = $fields[8];   // numeric value
          $upper  = $fields[12];  // uppercase mapping (hex)
          $lower  = $fields[13];  // lowercase mapping (hex)
          $title  = $fields[14];  // titlecase mapping (hex)

          $ord = hexdec($char);
          if ($ord > 0xFFFF) break; // only process the BMP

          $utf8_char = $this->UnumberToChar($ord);
          if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
          if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
          // store "title" only when different from "upper" (only a few);
          // strict comparison, because hex strings such as "0E10" would otherwise be compared as numbers
          if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

          // First letter of the category (substr() instead of the removed {0} offset syntax)
          switch (substr($cat,0,1)) {
              case 'M': // mark (accent, umlaut, ...)
                  $mark["U+$char"] = 1;
                  break;
              case 'N': // numeric value
                  if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
                  break;
          }

          // accented Latin letters without "official" decomposition
          $match = array();
          if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
              $c = ord($match[2]);
              if ($match[1] == 'SMALL') $c += 32;
              $decomposition["U+$char"] = array(dechex($c));
              continue;
          }

          // $match[1] = optional "<tag>", $match[2] = list of code points
          $match = array();
          if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
              switch($match[1]) {
                  case '<circle>': // add parenthesis as circle replacement, eg (1)
                      $match[2] = '0028 '.$match[2].' 0029';
                      break;
                  case '<square>': // add square brackets as square replacement, eg [1]
                      $match[2] = '005B '.$match[2].' 005D';
                      break;
                  case '<compat>': // ignore multi char decompositions that start with a space
                      if (strpos($match[2],'0020 ') === 0) continue 2;
                      break;
                  // ignore Arabic and vertical layout presentation decomposition
                  case '<initial>':
                  case '<medial>':
                  case '<final>':
                  case '<isolated>':
                  case '<vertical>':
                      continue 2;
              }
              $decomposition["U+$char"] = explode(' ',$match[2]);
          }
      }
      fclose($fh);

      // process additional Unicode data for casing (allow folded characters to expand into a sequence)
      $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
      if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
          $fh = fopen($specialCasingFile,'rb');
          if ($fh) {
              while (($line = fgets($fh,4096)) !== false) {
                  // skip comments and blank lines
                  if (substr($line,0,1) == '#' || trim($line) == '') continue;

                  list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';',$line),5,'');

                  // Only mappings without a condition are used; the 5th column is
                  // then either empty or already the trailing "#" comment.
                  if ($cond != '' && substr($cond,0,1) != '#') continue;

                  $utf8_char = $this->UnumberToChar(hexdec($char));
                  $targets = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
                  foreach ($targets as $table => $hexSeq) {
                      // a mapping onto the character itself is no mapping
                      if ($char === $hexSeq) continue;
                      // "title" is stored only when different from "upper"
                      if ($table == 'toTitle' && $title === $upper) continue;

                      // convert the space separated code points into one UTF-8 string
                      $utf8Seq = '';
                      foreach (explode(' ',$hexSeq) as $hexCode) {
                          $utf8Seq .= $this->UnumberToChar(hexdec($hexCode));
                      }
                      $utf8CaseFolding[$table][$utf8_char] = $utf8Seq;
                  }
              }
              fclose($fh);
          }
      }

      // process custom decompositions
      $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
      if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
          $fh = fopen($customTranslitFile,'rb');
          if ($fh) {
              while (($line = fgets($fh,4096)) !== false) {
                  // skip comments and blank lines
                  if (substr($line,0,1) == '#' || trim($line) == '') continue;

                  list($char,$translit) = array_pad(t3lib_div::trimExplode(';',$line),2,'');
                  // an empty transliteration means: drop this character
                  if (!$translit) $omit["U+$char"] = 1;
                  $decomposition["U+$char"] = explode(' ',$translit);
              }
              fclose($fh);
          }
      }

      // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
      foreach($decomposition as $from => $to) {
          $code_decomp = array();
          // the loop ends on the first empty entry as well as on the end of the list
          while ($code_value = array_shift($to)) {
              if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                  foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
                      array_unshift($to, $cv);
                  }
              } elseif (!isset($mark["U+$code_value"])) { // remove mark
                  array_push($code_decomp, $code_value);
              }
          }
          if (count($code_decomp) || isset($omit[$from])) {
              $decomposition[$from] = $code_decomp;
          } else {
              unset($decomposition[$from]);
          }
      }

      // create ascii only mapping
      $this->toASCII['utf-8'] = array();
      $ascii =& $this->toASCII['utf-8'];
      foreach($decomposition as $from => $to) {
          $code_decomp = array();
          while ($code_value = array_shift($to)) {
              $ord = hexdec($code_value);
              if ($ord > 127)
                  continue 2; // skip decompositions containing non-ASCII chars
              else
                  array_push($code_decomp,chr($ord));
          }
          // strip the "U+" prefix: hexdec() on non-hex characters is deprecated since PHP 7.4
          $ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
      }

      // add numeric decompositions
      foreach($number as $from => $to) {
          $utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
          if (!isset($ascii[$utf8_char])) {
              $ascii[$utf8_char] = $to;
          }
      }

      if ($cacheFileCase) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
      }
      if ($cacheFileASCII) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
      }

      return 3;
  }
  1111. /**
  1112. * This function initializes the folding table for a charset other than UTF-8.
  1113. * This function is automatically called by the case folding functions.
  1114. *
  1115. * @param string Charset for which to initialize case folding.
  1116. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1117. * @access private
  1118. */
  function initCaseFolding($charset) {
      // Table already in memory?
      if (is_array($this->caseFolding[$charset])) return 1;

      // Try the cached copy next.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // The table is derived from the charset <-> UTF-8 conversion table ...
      if (!$this->initCharset($charset)) {
          return false;
      }
      // ... and from the UTF-8 case folding table, which serves as the base.
      if (!$this->initUnicodeData('case')) {
          return false;
      }

      $nochar = chr($this->noCharByteVal);
      $directions = array('toUpper','toLower','toTitle');

      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          // convert back to the charset (don't use chr() of the numeric value, it might be multi-byte)
          $c = $this->utf8_decode($utf8, $charset);

          foreach ($directions as $direction) {
              // fold in UTF-8, then convert the result back to the charset
              $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
              // keep the mapping only if the folded character exists in the charset
              if ($cc != '' && $cc != $nochar) {
                  $this->caseFolding[$charset][$direction][$c] = $cc;
              }
          }
      }

      // add the ASCII case table: a-z (97-122) and A-Z (65-90) are 32 apart
      foreach (range(97,122) as $code) {
          $this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code-32);
      }
      foreach (range(65,90) as $code) {
          $this->caseFolding[$charset]['toLower'][chr($code)] = chr($code+32);
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
      }

      return 3;
  }
  1162. /**
  1163. * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
  1164. * This function is automatically called by the ASCII transliteration functions.
  1165. *
  1166. * @param string Charset for which to initialize conversion.
  1167. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1168. * @access private
  1169. */
  function initToASCII($charset) {
      // Only process if the to-ASCII table is not yet loaded
      // (isset() first, so an unknown charset does not raise an undefined-index notice):
      if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

      // Use cached version if possible
      // NOTE(review): unserialize() on a file from the temp directory - safe only as long
      // as that directory is writable by this application alone; confirm for your setup.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // init UTF-8 conversion for this charset
      if (!$this->initCharset($charset)) {
          return false;
      }

      // UTF-8/ASCII transliteration is used as the base conversion table
      if (!$this->initUnicodeData('ascii')) {
          return false;
      }

      // Start from an empty table. Without this, a charset that has no transliterable
      // character would leave the entry unset: NULL would be written to the cache and
      // the "already loaded" check above would never succeed.
      $this->toASCII[$charset] = array();

      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          if (isset($this->toASCII['utf-8'][$utf8])) {
              // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
              $c = $this->utf8_decode($utf8, $charset);
              $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
          }
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
      }

      return 3;
  }
  1200. /********************************************
  1201. *
  1202. * String operation functions
  1203. *
  1204. ********************************************/
  1205. /**
  1206. * Returns a part of a string.
  1207. * Unit-tested by Kasper (single byte charsets only)
  1208. *
  1209. * @param string The character set
  1210. * @param string Character string
  1211. * @param integer Start position (character position)
  1212. * @param integer Length (in characters)
  1213. * @return string The substring
  1214. * @see substr(), mb_substr()
  1215. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1216. */
  1217. function substr($charset,$string,$start,$len=null) {
  1218. if ($len===0) return '';
  1219. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1220. // cannot omit $len, when specifying charset
  1221. if ($len==null) {
  1222. $enc = mb_internal_encoding(); // save internal encoding
  1223. mb_internal_encoding($charset);
  1224. $str = mb_substr($string,$start);
  1225. mb_internal_encoding($enc); // restore internal encoding
  1226. return $str;
  1227. }
  1228. else {
  1229. return mb_substr($string,$start,$len,$charset);
  1230. }
  1231. } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
  1232. // cannot omit $len, when specifying charset
  1233. if ($len==null) {
  1234. $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
  1235. iconv_set_encoding('internal_encoding',$charset);
  1236. $str = iconv_substr($string,$start);
  1237. iconv_set_encoding('internal_encoding',$enc); // restore internal encoding
  1238. return $str;
  1239. }
  1240. else {
  1241. return iconv_substr($string,$start,$len,$charset);
  1242. }
  1243. } elseif ($charset == 'utf-8') {
  1244. return $this->utf8_substr($string,$start,$len);
  1245. } elseif ($this->eucBasedSets[$charset]) {
  1246. return $this->euc_substr($string,$start,$charset,$len);
  1247. } elseif ($this->twoByteSets[$charset]) {
  1248. return substr($string,$start*2,$len*2);
  1249. } elseif ($this->fourByteSets[$charset]) {
  1250. return substr($string,$start*4,$len*4);
  1251. }
  1252. // treat everything else as single-byte encoding
  1253. return $len === NULL ? substr($string,$start) : substr($string,$start,

Large files files are truncated, but you can click here to view the full file