PageRenderTime 46ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/typo3/class.t3lib_cs.php

https://bitbucket.org/synergylearning/campusconnect
PHP | 2367 lines | 1573 code | 218 blank | 576 comment | 376 complexity | 87eb15c41b0d2fb4a78985187ca298ea MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-3.0, GPL-3.0, LGPL-2.1, Apache-2.0, BSD-3-Clause, AGPL-3.0

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the Typo3 project. The Typo3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. *
  17. * This script is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * This copyright notice MUST APPEAR in all copies of the script!
  23. ***************************************************************/
  24. /**
  25. * Class for conversion between charsets.
  26. *
  27. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  28. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  29. */
  30. /**
  31. * Notes on UTF-8
  32. *
  33. * Functions working on UTF-8 strings:
  34. *
  35. * - strchr/strstr
  36. * - strrchr
  37. * - substr_count
  38. * - implode/explode/join
  39. *
  40. * Functions nearly working on UTF-8 strings:
  41. *
  42. * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  43. * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  44. * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  45. * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
  46. * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
  47. *
  48. * Functions NOT working on UTF-8 strings:
  49. *
  50. * - str*cmp
  51. * - stristr
  52. * - stripos
  53. * - substr
  54. * - strrev
  55. * - split/spliti
  56. * - ...
  57. *
  58. */
  59. /**
  60. * Class for conversion between charsets
  61. *
  62. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  63. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  64. * @package TYPO3
  65. * @subpackage t3lib
  66. */
  67. class t3lib_cs {
  68. /**
  69. * @var t3lib_l10n_Locales
  70. */
  71. protected $locales;
  72. var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
  73. // This is the array where parsed conversion tables are stored (cached)
  74. var $parsedCharsets = array();
  75. // An array where case folding data will be stored (cached)
  76. var $caseFolding = array();
  77. // An array where charset-to-ASCII mappings are stored (cached)
  78. var $toASCII = array();
  79. // This tells the converter which charsets has two bytes per char:
  80. var $twoByteSets = array(
  81. 'ucs-2' => 1, // 2-byte Unicode
  82. );
  83. // This tells the converter which charsets has four bytes per char:
  84. var $fourByteSets = array(
  85. 'ucs-4' => 1, // 4-byte Unicode
  86. 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  87. );
  88. // This tells the converter which charsets use a scheme like the Extended Unix Code:
  89. var $eucBasedSets = array(
  90. 'gb2312' => 1, // Chinese, simplified.
  91. 'big5' => 1, // Chinese, traditional.
  92. 'euc-kr' => 1, // Korean
  93. 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
  94. );
  95. // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
  96. // http://czyborra.com/charsets/iso8859.html
  97. var $synonyms = array(
  98. 'us' => 'ascii',
  99. 'us-ascii' => 'ascii',
  100. 'cp819' => 'iso-8859-1',
  101. 'ibm819' => 'iso-8859-1',
  102. 'iso-ir-100' => 'iso-8859-1',
  103. 'iso-ir-101' => 'iso-8859-2',
  104. 'iso-ir-109' => 'iso-8859-3',
  105. 'iso-ir-110' => 'iso-8859-4',
  106. 'iso-ir-144' => 'iso-8859-5',
  107. 'iso-ir-127' => 'iso-8859-6',
  108. 'iso-ir-126' => 'iso-8859-7',
  109. 'iso-ir-138' => 'iso-8859-8',
  110. 'iso-ir-148' => 'iso-8859-9',
  111. 'iso-ir-157' => 'iso-8859-10',
  112. 'iso-ir-179' => 'iso-8859-13',
  113. 'iso-ir-199' => 'iso-8859-14',
  114. 'iso-ir-203' => 'iso-8859-15',
  115. 'csisolatin1' => 'iso-8859-1',
  116. 'csisolatin2' => 'iso-8859-2',
  117. 'csisolatin3' => 'iso-8859-3',
  118. 'csisolatin5' => 'iso-8859-9',
  119. 'csisolatin8' => 'iso-8859-14',
  120. 'csisolatin9' => 'iso-8859-15',
  121. 'csisolatingreek' => 'iso-8859-7',
  122. 'iso-celtic' => 'iso-8859-14',
  123. 'latin1' => 'iso-8859-1',
  124. 'latin2' => 'iso-8859-2',
  125. 'latin3' => 'iso-8859-3',
  126. 'latin5' => 'iso-8859-9',
  127. 'latin6' => 'iso-8859-10',
  128. 'latin8' => 'iso-8859-14',
  129. 'latin9' => 'iso-8859-15',
  130. 'l1' => 'iso-8859-1',
  131. 'l2' => 'iso-8859-2',
  132. 'l3' => 'iso-8859-3',
  133. 'l5' => 'iso-8859-9',
  134. 'l6' => 'iso-8859-10',
  135. 'l8' => 'iso-8859-14',
  136. 'l9' => 'iso-8859-15',
  137. 'cyrillic' => 'iso-8859-5',
  138. 'arabic' => 'iso-8859-6',
  139. 'tis-620' => 'iso-8859-11',
  140. 'win874' => 'windows-874',
  141. 'win1250' => 'windows-1250',
  142. 'win1251' => 'windows-1251',
  143. 'win1252' => 'windows-1252',
  144. 'win1253' => 'windows-1253',
  145. 'win1254' => 'windows-1254',
  146. 'win1255' => 'windows-1255',
  147. 'win1256' => 'windows-1256',
  148. 'win1257' => 'windows-1257',
  149. 'win1258' => 'windows-1258',
  150. 'cp1250' => 'windows-1250',
  151. 'cp1251' => 'windows-1251',
  152. 'cp1252' => 'windows-1252',
  153. 'ms-ee' => 'windows-1250',
  154. 'ms-ansi' => 'windows-1252',
  155. 'ms-greek' => 'windows-1253',
  156. 'ms-turk' => 'windows-1254',
  157. 'winbaltrim' => 'windows-1257',
  158. 'koi-8ru' => 'koi-8r',
  159. 'koi8r' => 'koi-8r',
  160. 'cp878' => 'koi-8r',
  161. 'mac' => 'macroman',
  162. 'macintosh' => 'macroman',
  163. 'euc-cn' => 'gb2312',
  164. 'x-euc-cn' => 'gb2312',
  165. 'euccn' => 'gb2312',
  166. 'cp936' => 'gb2312',
  167. 'big-5' => 'big5',
  168. 'cp950' => 'big5',
  169. 'eucjp' => 'euc-jp',
  170. 'sjis' => 'shift_jis',
  171. 'shift-jis' => 'shift_jis',
  172. 'cp932' => 'shift_jis',
  173. 'cp949' => 'euc-kr',
  174. 'utf7' => 'utf-7',
  175. 'utf8' => 'utf-8',
  176. 'utf16' => 'utf-16',
  177. 'utf32' => 'utf-32',
  178. 'utf8' => 'utf-8',
  179. 'ucs2' => 'ucs-2',
  180. 'ucs4' => 'ucs-4',
  181. );
  /**
   * Mapping of language identifiers to script (language family) names.
   *
   * Three kinds of keys are covered: ISO 639-1 codes, Microsoft three-letter
   * language codes and English language names. All keys are lowercase.
   * The values are used as keys into $script_to_charset_unix and
   * $script_to_charset_windows, so they must be spelled exactly as there.
   */
  var $lang_to_script = array(
    // ISO 639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',
    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    // Was misspelled 'west_euorpean', which matches no key of the script-to-charset tables.
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian
    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Same script as the 'bg' and 'bgr' entries above (was 'east_european').
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european', // key was misspelled 'svedish'
    'thai' => 'thai', // key was misspelled 'that'
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
  );
  345. // mapping of language (family) names to charsets on Unix
  346. var $script_to_charset_unix = array(
  347. 'west_european' => 'iso-8859-1',
  348. 'estonian' => 'iso-8859-1',
  349. 'east_european' => 'iso-8859-2',
  350. 'baltic' => 'iso-8859-4',
  351. 'cyrillic' => 'iso-8859-5',
  352. 'arabic' => 'iso-8859-6',
  353. 'greek' => 'iso-8859-7',
  354. 'hebrew' => 'iso-8859-8',
  355. 'turkish' => 'iso-8859-9',
  356. 'thai' => 'iso-8859-11', // = TIS-620
  357. 'lithuanian' => 'iso-8859-13',
  358. 'chinese' => 'gb2312', // = euc-cn
  359. 'japanese' => 'euc-jp',
  360. 'korean' => 'euc-kr',
  361. 'simpl_chinese' => 'gb2312',
  362. 'trad_chinese' => 'big5',
  363. 'vietnamese' => '',
  364. 'unicode' => 'utf-8',
  365. 'albanian' => 'utf-8'
  366. );
  367. // mapping of language (family) names to charsets on Windows
  368. var $script_to_charset_windows = array(
  369. 'east_european' => 'windows-1250',
  370. 'cyrillic' => 'windows-1251',
  371. 'west_european' => 'windows-1252',
  372. 'greek' => 'windows-1253',
  373. 'turkish' => 'windows-1254',
  374. 'hebrew' => 'windows-1255',
  375. 'arabic' => 'windows-1256',
  376. 'baltic' => 'windows-1257',
  377. 'estonian' => 'windows-1257',
  378. 'lithuanian' => 'windows-1257',
  379. 'vietnamese' => 'windows-1258',
  380. 'thai' => 'cp874',
  381. 'korean' => 'cp949',
  382. 'chinese' => 'gb2312',
  383. 'japanese' => 'shift_jis',
  384. 'simpl_chinese' => 'gb2312',
  385. 'trad_chinese' => 'big5',
  386. 'albanian' => 'windows-1250',
  387. 'unicode' => 'utf-8'
  388. );
  // Mapping of complete locale names to charsets.
  // Keys must be lowercase: get_locale_charset() lowercases the locale string
  // before looking it up here, so a mixed-case key (formerly 'sr@Latn') could
  // never be matched.
  var $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5',
  );
  399. // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
  400. // Empty values means "iso-8859-1"
  401. var $charSetArray = array(
  402. 'af' => '',
  403. 'ar' => 'iso-8859-6',
  404. 'ba' => 'iso-8859-2',
  405. 'bg' => 'windows-1251',
  406. 'br' => '',
  407. 'ca' => 'iso-8859-15',
  408. 'ch' => 'gb2312',
  409. 'cs' => 'windows-1250',
  410. 'cz' => 'windows-1250',
  411. 'da' => '',
  412. 'de' => '',
  413. 'dk' => '',
  414. 'el' => 'iso-8859-7',
  415. 'eo' => 'utf-8',
  416. 'es' => '',
  417. 'et' => 'iso-8859-4',
  418. 'eu' => '',
  419. 'fa' => 'utf-8',
  420. 'fi' => '',
  421. 'fo' => 'utf-8',
  422. 'fr' => '',
  423. 'fr_CA' => '',
  424. 'ga' => '',
  425. 'ge' => 'utf-8',
  426. 'gl' => '',
  427. 'gr' => 'iso-8859-7',
  428. 'he' => 'utf-8',
  429. 'hi' => 'utf-8',
  430. 'hk' => 'big5',
  431. 'hr' => 'windows-1250',
  432. 'hu' => 'iso-8859-2',
  433. 'is' => 'utf-8',
  434. 'it' => '',
  435. 'ja' => 'shift_jis',
  436. 'jp' => 'shift_jis',
  437. 'ka' => 'utf-8',
  438. 'kl' => 'utf-8',
  439. 'km' => 'utf-8',
  440. 'ko' => 'euc-kr',
  441. 'kr' => 'euc-kr',
  442. 'lt' => 'windows-1257',
  443. 'lv' => 'utf-8',
  444. 'ms' => '',
  445. 'my' => '',
  446. 'nl' => '',
  447. 'no' => '',
  448. 'pl' => 'iso-8859-2',
  449. 'pt' => '',
  450. 'pt_BR' => '',
  451. 'qc' => '',
  452. 'ro' => 'iso-8859-2',
  453. 'ru' => 'windows-1251',
  454. 'se' => '',
  455. 'si' => 'windows-1250',
  456. 'sk' => 'windows-1250',
  457. 'sl' => 'windows-1250',
  458. 'sq' => 'utf-8',
  459. 'sr' => 'utf-8',
  460. 'sv' => '',
  461. 'th' => 'iso-8859-11',
  462. 'tr' => 'iso-8859-9',
  463. 'ua' => 'windows-1251',
  464. 'uk' => 'windows-1251',
  465. 'vi' => 'utf-8',
  466. 'vn' => 'utf-8',
  467. 'zh' => 'big5',
  468. );
  469. // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
  470. // Missing keys means: same as TYPO3
  471. // @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
  472. var $isoArray = array(
  473. 'ba' => 'bs',
  474. 'br' => 'pt_BR',
  475. 'ch' => 'zh_CN',
  476. 'cz' => 'cs',
  477. 'dk' => 'da',
  478. 'si' => 'sl',
  479. 'se' => 'sv',
  480. 'gl' => 'kl',
  481. 'gr' => 'el',
  482. 'hk' => 'zh_HK',
  483. 'kr' => 'ko',
  484. 'ua' => 'uk',
  485. 'jp' => 'ja',
  486. 'qc' => 'fr_CA',
  487. 'vn' => 'vi',
  488. 'ge' => 'ka',
  489. 'ga' => 'gl',
  490. );
  /**
   * Default constructor.
   *
   * Fetches the 't3lib_l10n_Locales' instance through t3lib_div::makeInstance()
   * and stores it in $this->locales.
   */
  public function __construct() {
    $localesInstance = t3lib_div::makeInstance('t3lib_l10n_Locales');
    $this->locales = $localesInstance;
  }
  /**
   * Normalizes a charset name: lowercases it, trims surrounding whitespace
   * and replaces it by its canonical name if it is listed in $this->synonyms.
   *
   * @param string $charset Input charset
   * @return string Normalized charset
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  public function parse_charset($charset) {
    $normalized = strtolower($charset);
    $normalized = trim($normalized);
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
  }
  /**
   * Get the charset of a locale.
   *
   * Accepted locale forms:
   *
   * ln language
   * ln_CN language / country
   * ln_CN.cs language / country / charset
   * ln_CN.cs@mod language / country / charset / modifier
   *
   * Resolution order:
   * 1. exact match of the lowercased locale in $this->locale_to_charset
   * 2. an explicit charset part (".cs"), normalized through parse_charset()
   * 3. the "euro" modifier, which yields 'iso-8859-15'
   * 4. the language part, mapped to a script via $this->lang_to_script and then
   *    to a charset via the Windows or Unix script table (chosen by TYPO3_OS)
   * 5. the platform default: 'windows-1252' on Windows, 'utf-8' otherwise
   *
   * @param string $locale Locale string
   * @return string Charset resolved for locale string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  public function get_locale_charset($locale) {
    $locale = strtolower($locale);

    // Exact locale specific charset?
    if (isset($this->locale_to_charset[$locale])) {
      return $this->locale_to_charset[$locale];
    }

    // Split off the modifier. array_pad() guarantees that both list() targets
    // exist even when the locale carries no '@' part (avoids undefined offsets).
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
      return $this->parse_charset($charset);
    }

    // Modifier is 'euro' (checked after the charset, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
      return 'iso-8859-15';
    }

    // Language is the part before the first underscore; the country is not needed.
    $localeParts = explode('_', $locale);
    $language = $localeParts[0];
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

    if (TYPO3_OS == 'WIN') {
      $scriptToCharset = $this->script_to_charset_windows;
      $defaultCharset = 'windows-1252';
    } else {
      $scriptToCharset = $this->script_to_charset_unix;
      $defaultCharset = 'utf-8';
    }

    // Unknown scripts and scripts mapped to an empty charset fall back to the default.
    return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $defaultCharset;
  }
  552. /********************************************
  553. *
  554. * Charset Conversion functions
  555. *
  556. ********************************************/
  /**
   * Convert from one charset to another charset.
   *
   * If the target is UTF-8 or no entity fallback is requested, the PHP library
   * selected in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
   * ('mbstring', 'iconv' or 'recode') is tried first. When that setting is
   * missing, the library function is not available, or the library reports a
   * failure, the conversion falls back to utf8_encode() / utf8_decode().
   *
   * @param string $str Input string
   * @param string $fromCS From charset (the current charset of the string)
   * @param string $toCS To charset (the output charset wanted)
   * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Converted string
   * @see convArray()
   */
  public function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
    if ($fromCS == $toCS) {
      return $str;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCS == 'utf-8' || !$useEntityForNoChar) {
      // The setting is optional; treat a missing key like "no library configured".
      $convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
        : '';
      $conv_str = FALSE;
      // Each library is only called when its function exists, so a configured
      // but unavailable extension leads to the fallback instead of a fatal error.
      switch ($convMethod) {
        case 'mbstring':
          if (function_exists('mb_convert_encoding')) {
            try {
              // Returns FALSE for unsupported charsets on older PHP versions ...
              $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
            } catch (ValueError $e) {
              // ... and throws ValueError for them as of PHP 8.0.
              $conv_str = FALSE;
            }
          }
          break;
        case 'iconv':
          if (function_exists('iconv')) {
            $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
          }
          break;
        case 'recode':
          if (function_exists('recode_string')) {
            $conv_str = recode_string($fromCS . '..' . $toCS, $str);
          }
          break;
      }
      if (FALSE !== $conv_str) {
        return $conv_str;
      }
      // fallback to TYPO3 conversion
    }
    if ($fromCS != 'utf-8') {
      $str = $this->utf8_encode($str, $fromCS);
    }
    if ($toCS != 'utf-8') {
      $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
    }
    return $str;
  }
  /**
   * Converts every string element of an array from one charset to another.
   * Nested arrays are processed recursively; elements that are neither arrays
   * nor strings are left untouched.
   * NOTICE: Array is passed by reference and modified in place!
   *
   * @param array $array Input array, possibly multidimensional
   * @param string $fromCS From charset (the current charset of the strings)
   * @param string $toCS To charset (the output charset wanted)
   * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return void
   * @see conv()
   */
  public function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
    foreach ($array as &$element) {
      if (is_array($element)) {
        $this->convArray($element, $fromCS, $toCS, $useEntityForNoChar);
        continue;
      }
      if (is_string($element)) {
        $element = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
      }
    }
    // Break the reference so later writes to $element cannot touch the array.
    unset($element);
  }
  /**
   * Converts $str from $charset to UTF-8
   *
   * Bytes 0-127 are copied unchanged. All other characters are looked up in
   * the 'local' part of the parsed conversion table of the charset; characters
   * without an entry are replaced by chr($this->noCharByteVal).
   * If initCharset() does not succeed for the charset, nothing is returned (NULL).
   *
   * @param string $str String in local charset to convert to UTF-8
   * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
   * @return string Output string, converted to UTF-8
   */
  public function utf8_encode($str, $charset) {
    if ($charset === 'utf-8') {
      return $str;
    }
    // Charset is case-insensitive.
    if ($this->initCharset($charset)) { // Parse conv. table if not already...
      $strLen = strlen($str);
      $outStr = '';
      for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
          // substr() instead of the removed $str{...} offset syntax; it also
          // yields 0 for the missing low byte of an odd-length string instead
          // of raising an "uninitialized string offset" error.
          $ord2 = ord(substr($str, $a + 1, 1));
          $ord = $ord << 8 | $ord2; // assume big endian
          if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char-number found in the parsed conv. table
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
          } else { // No char exists: use the replacement byte
            $outStr .= chr($this->noCharByteVal);
          }
          $a++;
        } elseif ($ord > 127) { // Byte values over 127 need a table lookup
          if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
            if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
              $a++;
              $ord2 = ord(substr($str, $a, 1));
              $ord = $ord * 256 + $ord2;
            }
          }
          if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char-number found in the parsed conv. table
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
          } else { // No char exists: use the replacement byte
            $outStr .= chr($this->noCharByteVal);
          }
        } else { // ... otherwise it's just ASCII 0-127 and one byte. Transparent
          $outStr .= $chr;
        }
      }
      return $outStr;
    }
  }
  /**
   * Converts $str from UTF-8 to $charset
   *
   * Bytes 0-127 are copied unchanged. Every multibyte sequence is looked up in
   * the 'utf8' part of the parsed conversion table of the charset. Sequences
   * without an entry become a numeric entity (if $useEntityForNoChar is set)
   * or chr($this->noCharByteVal). A continuation byte found where a lead byte
   * is expected is replaced by chr($this->noCharByteVal) as well.
   * If initCharset() does not succeed for the charset, nothing is returned (NULL).
   *
   * @param string $str String in UTF-8 to convert to local charset
   * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
   * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Output string, converted to local charset
   */
  public function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
    if ($charset === 'utf-8') {
      return $str;
    }
    // Parse the conversion table if that has not happened yet.
    if (!$this->initCharset($charset)) {
      return;
    }

    $length = strlen($str);
    $converted = '';
    for ($offset = 0; $offset < $length; $offset++) {
      $byte = substr($str, $offset, 1);
      $value = ord($byte);

      // Plain ASCII (0-127) is passed through as is.
      if ($value <= 127) {
        $converted .= $byte;
        continue;
      }

      // A lead byte has bit 7 set; without it we are in the MIDDLE of a sequence.
      if (($value & 64) === 0) {
        $converted .= chr($this->noCharByteVal);
        continue;
      }

      // Collect the sequence: each further high bit of the lead byte announces
      // one more byte to take from the input.
      $sequence = $byte;
      for ($bit = 0; $bit < 8; $bit++) {
        $value = $value << 1;
        if (($value & 128) === 0) {
          break;
        }
        $offset++;
        $sequence .= substr($str, $offset, 1);
      }

      if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
        // The local number; values above 255 are written as two bytes (16bit word assumed).
        $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
        $converted .= $localNumber > 255
          ? chr(($localNumber >> 8) & 255) . chr($localNumber & 255)
          : chr($localNumber);
      } elseif ($useEntityForNoChar) {
        // Create numeric entity
        $converted .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
      } else {
        // No char exists
        $converted .= chr($this->noCharByteVal);
      }
    }
    return $converted;
  }
  /**
   * Converts all chars > 127 to numeric entities.
   *
   * Bytes 0-127 are copied unchanged. Every multibyte sequence is replaced by
   * '&#' . utf8CharToUnumber(sequence, 1) . ';'. A continuation byte found
   * where a lead byte is expected is replaced by chr($this->noCharByteVal).
   *
   * @param string $str Input string
   * @return string Output string
   */
  public function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    for ($offset = 0; $offset < $length; $offset++) {
      $byte = substr($str, $offset, 1);
      $value = ord($byte);

      // Plain ASCII (0-127) is passed through as is.
      if ($value <= 127) {
        $result .= $byte;
        continue;
      }

      // A lead byte has bit 7 set; without it we are in the MIDDLE of a sequence.
      if (($value & 64) === 0) {
        $result .= chr($this->noCharByteVal);
        continue;
      }

      // Collect the sequence: each further high bit of the lead byte announces
      // one more byte to take from the input.
      $sequence = $byte;
      for ($bit = 0; $bit < 8; $bit++) {
        $value = $value << 1;
        if (($value & 128) === 0) {
          break;
        }
        $offset++;
        $sequence .= substr($str, $offset, 1);
      }
      $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
    }
    return $result;
  }
  /**
   * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
   *
   * Only well-formed numeric entities are converted: '#' followed by decimal
   * digits, or '#x' / '#X' followed by hexadecimal digits. Malformed ones
   * (like "&#;" or "&#12ab;") and unknown named entities are left unchanged.
   *
   * @param string $str Input string, UTF-8
   * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
   * @return string Output string
   */
  public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
    // Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
    // see http://php.net/manual/en/function.get-html-translation-table.php
    $applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
      if ($applyPhpCompatibilityFix === TRUE) {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
      } else {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
      }
    }

    // Split at every "&...;" and keep the captured entity name: even indexes
    // hold plain text, odd indexes hold the entity name without '&' and ';'.
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
      // PCRE failure: hand the input back unchanged rather than losing it.
      return $str;
    }

    foreach ($parts as $k => $v) {
      // only take every second element
      if ($k % 2 === 0) {
        continue;
      }
      $matches = array();
      if (preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/', $v, $matches)) { // Dec or hex entities:
        // Group 2 exists only when the decimal alternative matched.
        $number = isset($matches[2]) ? (int) $matches[2] : hexdec($matches[1]);
        $parts[$k] = $this->UnumberToChar($number);
      } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
        $v = $trans_tbl['&' . $v . ';'];
        if ($applyPhpCompatibilityFix === TRUE) {
          // Without the charset parameter the table is latin-1 encoded.
          $v = $this->utf8_encode($v, 'iso-8859-1');
        }
        $parts[$k] = $v;
      } else { // No conversion:
        $parts[$k] = '&' . $v . ';';
      }
    }
    return implode('', $parts);
  }
  /**
   * Converts all chars in the input UTF-8 string into integer numbers returned in an array
   *
   * Each character contributes one array element: bytes 0-127 give their byte
   * value, multibyte sequences give the result of utf8CharToUnumber(), and a
   * continuation byte found where a lead byte is expected gives
   * $this->noCharByteVal. With $retChar set, the character strings themselves
   * are returned instead of the numbers.
   *
   * @param string $str Input string, UTF-8
   * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
   * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
   * @return array Output array with the char numbers
   */
  public function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    // If entities must be registered as well, resolve them first.
    if ($convEntities) {
      $str = $this->entities_to_utf8($str, 1);
    }

    $length = strlen($str);
    $numbers = array();
    for ($offset = 0; $offset < $length; $offset++) {
      $byte = substr($str, $offset, 1);
      $value = ord($byte);

      // Plain ASCII (0-127): one byte, one entry.
      if ($value <= 127) {
        $numbers[] = $retChar ? chr($value) : $value;
        continue;
      }

      // A lead byte has bit 7 set; without it we are in the MIDDLE of a sequence.
      if (($value & 64) === 0) {
        $numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        continue;
      }

      // Collect the sequence: each further high bit of the lead byte announces
      // one more byte to take from the input.
      $sequence = $byte;
      for ($bit = 0; $bit < 8; $bit++) {
        $value = $value << 1;
        if (($value & 128) === 0) {
          break;
        }
        $offset++;
        $sequence .= substr($str, $offset, 1);
      }
      $numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }
    return $numbers;
  }
  847. /**
  848. * Converts a UNICODE number to a UTF-8 multibyte character
  849. * Algorithm based on a script found at http://czyborra.com/utf/
  850. * Unit-tested by Kasper
  851. *
  852. * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
  853. *
  854. * bytes | bits | representation
  855. * 1 | 7 | 0vvvvvvv
  856. * 2 | 11 | 110vvvvv 10vvvvvv
  857. * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
  858. * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  859. * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  860. * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  861. *
  862. * @param integer UNICODE integer
  863. * @return string UTF-8 multibyte character string
  864. * @see utf8CharToUnumber()
  865. */
  function UnumberToChar($cbyte) {
      // Encodes one Unicode number as a UTF-8 byte sequence of 1 to 6 bytes.

      // entities_to_utf8() hands over the raw text of a numeric entity, so the
      // argument may be a string. Turn it into a number once, up front: an empty
      // or non-numeric string counts as 0 (the historical result) instead of
      // raising a TypeError in chr() on PHP 8.
      if (is_string($cbyte)) {
          $cbyte = is_numeric($cbyte) ? $cbyte + 0 : (int) $cbyte;
      }

      // Negative numbers are no code points, and 2^31 and above does not fit
      // into the 31 payload bits of the longest (6 byte) form.
      if ($cbyte < 0 || $cbyte >= 0x80000000) {
          return chr($this->noCharByteVal);
      }
      $cbyte = (int) $cbyte;

      // 1 byte: 0vvvvvvv
      if ($cbyte < 0x80) {
          return chr($cbyte);
      }
      // 2 bytes: 110vvvvv 10vvvvvv
      if ($cbyte < 0x800) {
          return chr(0xC0 | ($cbyte >> 6))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 3 bytes: 1110vvvv 10vvvvvv 10vvvvvv
      if ($cbyte < 0x10000) {
          return chr(0xE0 | ($cbyte >> 12))
              . chr(0x80 | (($cbyte >> 6) & 0x3F))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 4 bytes: 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
      if ($cbyte < 0x200000) {
          return chr(0xF0 | ($cbyte >> 18))
              . chr(0x80 | (($cbyte >> 12) & 0x3F))
              . chr(0x80 | (($cbyte >> 6) & 0x3F))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 5 bytes: 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
      if ($cbyte < 0x4000000) {
          return chr(0xF8 | ($cbyte >> 24))
              . chr(0x80 | (($cbyte >> 18) & 0x3F))
              . chr(0x80 | (($cbyte >> 12) & 0x3F))
              . chr(0x80 | (($cbyte >> 6) & 0x3F))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 6 bytes: 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
      return chr(0xFC | ($cbyte >> 30))
          . chr(0x80 | (($cbyte >> 24) & 0x3F))
          . chr(0x80 | (($cbyte >> 18) & 0x3F))
          . chr(0x80 | (($cbyte >> 12) & 0x3F))
          . chr(0x80 | (($cbyte >> 6) & 0x3F))
          . chr(0x80 | ($cbyte & 0x3F));
  }
  910. /**
  911. * Converts a UTF-8 Multibyte character to a UNICODE number
  912. * Unit-tested by Kasper
  913. *
  914. * @param string UTF-8 multibyte character string
  915. * @param boolean If set, then a hex. number is returned.
  916. * @return integer UNICODE integer
  917. * @see UnumberToChar()
  918. */
  function utf8CharToUnumber($str, $hex = 0) {
      // Decodes the first UTF-8 character of $str into its Unicode number; with
      // $hex set the number is returned as a string of the form "x<hex digits>".
      $str = (string) $str;
      $lead = isset($str[0]) ? ord($str[0]) : 0;

      if (($lead & 0xC0) !== 0xC0) {
          // ASCII (or a stray continuation byte): the byte value is the result.
          $int = $lead;
      } else {
          // The number of set bits directly below the top bit of the lead byte
          // is the number of continuation bytes that follow.
          $trail = 0;
          for ($mask = 0x40; $mask > 0 && ($lead & $mask); $mask >>= 1) {
              $trail++;
          }

          if ($trail > 5) {
              // 0xFE and 0xFF leave no payload bits in the lead byte and can never
              // start a sequence; report the "no character" value rather than
              // decoding garbage.
              $int = $this->noCharByteVal;
          } else {
              // The lead byte contributes its lowest (6 - $trail) bits ...
              $int = $lead & (0x3F >> $trail);
              // ... and each continuation byte its lowest 6 bits. Bytes missing
              // from a truncated sequence count as zero.
              for ($i = 1; $i <= $trail; $i++) {
                  $int = ($int << 6) | (isset($str[$i]) ? ord($str[$i]) & 0x3F : 0);
              }
          }
      }

      return $hex ? 'x' . dechex($int) : $int;
  }
  938. /********************************************
  939. *
  940. * Init functions
  941. *
  942. ********************************************/
  943. /**
  944. * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
  945. * This function is automatically called by the conversion functions
  946. *
  947. * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
  948. *
  949. * @param string The charset to be initialized. Always use the lowercase charset name (it must exactly match a filename in the csconvtbl/ folder: [charset].tbl)
  950. * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
  951. * @access private
  952. */
  function initCharset($charset) {
      // Loads the conversion table of $charset into $this->parsedCharsets[$charset]
      // ('local' => byte value to UTF-8 char, 'utf8' => UTF-8 char to byte value).

      // Nothing to do if the charset is already loaded. isset() first, so that an
      // unknown charset does not raise an undefined-index notice.
      if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
          return 1;
      }

      // Conversion table filename:
      $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
      if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
          return FALSE;
      }

      // Cache file for charsets:
      // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          // NOTE(review): unserialize() runs on a file below typo3temp/ - assumes
          // that directory cannot be written by untrusted parties; confirm.
          $cached = unserialize(t3lib_div::getUrl($cacheFile));
          if (is_array($cached)) {
              $this->parsedCharsets[$charset] = $cached;
              return 2;
          }
          // Unreadable or corrupt cache: fall through and rebuild it from the table.
      }

      // Parse conversion table into lines:
      $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
      // Initialize the internal variable holding the conv. table:
      $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

      // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
      $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
      $detectedType = '';

      foreach ($lines as $value) {
          // Comment lines and blanks are ignored.
          if (!trim($value) || substr($value, 0, 1) == '#') {
              continue;
          }

          // Match once per line; the result serves both the type detection (done
          // on the first real line) and the extraction below.
          $regA = array();
          $isWhitespaced = FALSE;
          if ($detectedType !== 'ms-token') {
              $isWhitespaced = (bool) preg_match($whitespacedPattern, $value, $regA);
          }
          if (!$detectedType) {
              $detectedType = $isWhitespaced ? 'whitespaced' : 'ms-token';
          }

          if ($detectedType == 'ms-token') {
              $tokens = preg_split('/[=:]/', $value, 3);
              if (!is_array($tokens) || count($tokens) < 2) {
                  // Malformed line without a "byte = U+xxxx" pair: skip it rather
                  // than registering a mapping to U+0000.
                  continue;
              }
              $hexbyte = $tokens[0];
              $utf8 = $tokens[1];
          } else {
              if (!$isWhitespaced) {
                  // Line does not follow the "0x.. 0x...." syntax: nothing to map.
                  continue;
              }
              $hexbyte = $regA[1];
              $utf8 = 'U+' . $regA[2];
          }

          // Only values above 127 are stored; the ASCII range needs no table.
          $decval = hexdec(trim($hexbyte));
          if ($decval > 127) {
              $utf8decval = hexdec(substr(trim($utf8), 2)); // strip the "U+" prefix
              $utf8Char = $this->UnumberToChar($utf8decval);
              $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
              $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
          }
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
      }
      return 2;
  }
  1007. /**
  1008. * This function initializes all UTF-8 character data tables.
  1009. *
  1010. * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
  1011. *
  1012. * @param string Mode ("case", "ascii", ...)
  1013. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1014. * @access private
  1015. */
  function initUnicodeData($mode = NULL) {
      // Builds the UTF-8 case folding table ($this->caseFolding['utf-8']) and the
      // UTF-8 to ASCII transliteration table ($this->toASCII['utf-8']) from the
      // files in unidata/, or loads the table selected by $mode from its cache file.

      // cache files
      $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
      $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

      // Only process if the requested table is not yet loaded. A cache file that
      // does not unserialize to an array is ignored and the tables are rebuilt.
      switch ($mode) {
          case 'case':
              if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                  return 1;
              }
              if ($cacheFileCase && @is_file($cacheFileCase)) {
                  $cached = unserialize(t3lib_div::getUrl($cacheFileCase));
                  if (is_array($cached)) {
                      $this->caseFolding['utf-8'] = $cached;
                      return 2;
                  }
              }
              break;

          case 'ascii':
              if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                  return 1;
              }
              if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                  $cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
                  if (is_array($cached)) {
                      $this->toASCII['utf-8'] = $cached;
                      return 2;
                  }
              }
              break;
      }

      // process main Unicode data file
      $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
      if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
          return FALSE;
      }
      $fh = fopen($unicodeDataFile, 'rb');
      if (!$fh) {
          return FALSE;
      }

      // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
      // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
      $this->caseFolding['utf-8'] = array();
      $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
      $utf8CaseFolding['toUpper'] = array();
      $utf8CaseFolding['toLower'] = array();
      $utf8CaseFolding['toTitle'] = array();

      $decomposition = array(); // array of temp. decompositions
      $mark = array(); // array of chars that are marks (eg. composing accents)
      $number = array(); // array of chars that are numbers (eg. digits)
      $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

      // Loop on fgets() itself: testing feof() first hands one final FALSE "line"
      // to the parser.
      while (($line = fgets($fh, 4096)) !== FALSE) {
          $line = rtrim($line);
          if ($line === '') {
              continue;
          }

          // A line has a lot of info; pad it so that short lines do not produce
          // undefined offsets. Only some of the 15 fields are used.
          $fields = array_pad(explode(';', $line), 15, '');
          $char = $fields[0];
          $name = $fields[1];
          $cat = $fields[2];
          $decomp = $fields[5];
          $num = $fields[8];
          $upper = $fields[12];
          $lower = $fields[13];
          $title = $fields[14];

          $ord = hexdec($char);
          if ($ord > 0xFFFF) {
              break;
          } // only process the BMP

          $utf8_char = $this->UnumberToChar($ord);
          if ($upper) {
              $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
          }
          if ($lower) {
              $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
          }
          // store "title" only when different from "upper" (only a few)
          if ($title && $title !== $upper) {
              $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
          }

          // First letter of the general category. substr() instead of the {0}
          // offset syntax, which is a parse error on PHP 8.
          switch ((string) substr($cat, 0, 1)) {
              case 'M': // mark (accent, umlaut, ...)
                  $mark["U+$char"] = 1;
                  break;
              case 'N': // numeric value
                  if ($ord > 0x80 && $num !== '') {
                      $number["U+$char"] = $num;
                  }
                  break;
          }

          // accented Latin letters without "official" decomposition
          $match = array();
          if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
              $c = ord($match[2]);
              if ($match[1] == 'SMALL') {
                  $c += 32;
              }
              $decomposition["U+$char"] = array(dechex($c));
              continue;
          }

          $match = array();
          if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
              switch ($match[1]) {
                  case '<circle>': // add parenthesis as circle replacement, eg (1)
                      $match[2] = '0028 ' . $match[2] . ' 0029';
                      break;
                  case '<square>': // add square brackets as square replacement, eg [1]
                      $match[2] = '005B ' . $match[2] . ' 005D';
                      break;
                  case '<compat>': // ignore multi char decompositions that start with a space
                      if (preg_match('/^0020 /', $match[2])) {
                          continue 2;
                      }
                      break;
                  // ignore Arabic and vertical layout presentation decomposition
                  case '<initial>':
                  case '<medial>':
                  case '<final>':
                  case '<isolated>':
                  case '<vertical>':
                      continue 2;
              }
              $decomposition["U+$char"] = explode(' ', $match[2]);
          }
      }
      fclose($fh);

      // process additional Unicode data for casing (allow folded characters to expand into a sequence)
      $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
      if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
          $fh = fopen($specialCasingFile, 'rb');
          if ($fh) {
              while (($line = fgets($fh, 4096)) !== FALSE) {
                  if ($line[0] == '#' || trim($line) == '') {
                      continue;
                  }
                  list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
                  // Only unconditional mappings are used: the 5th field must be
                  // empty or be the trailing comment.
                  if ($cond != '' && substr($cond, 0, 1) != '#') {
                      continue;
                  }

                  $utf8_char = $this->UnumberToChar(hexdec($char));

                  // Collect the mappings that differ from the character itself.
                  // The fields are hex strings, so they are compared strictly:
                  // a loose comparison would read values such as "1E00"
                  // numerically.
                  $sequences = array();
                  if ($char !== $lower) {
                      $sequences['toLower'] = $lower;
                  }
                  if ($char !== $title && $title !== $upper) {
                      $sequences['toTitle'] = $title;
                  }
                  if ($char !== $upper) {
                      $sequences['toUpper'] = $upper;
                  }

                  // Each mapping is a space separated list of code points.
                  foreach ($sequences as $table => $codePoints) {
                      $folded = '';
                      foreach (explode(' ', $codePoints) as $codePoint) {
                          $folded .= $this->UnumberToChar(hexdec($codePoint));
                      }
                      $utf8CaseFolding[$table][$utf8_char] = $folded;
                  }
              }
              fclose($fh);
          }
      }

      // process custom decompositions
      $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
      if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
          $fh = fopen($customTranslitFile, 'rb');
          if ($fh) {
              while (($line = fgets($fh, 4096)) !== FALSE) {
                  if ($line[0] == '#' || trim($line) == '') {
                      continue;
                  }
                  list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
                  // An empty transliteration means: drop the character altogether.
                  if (!$translit) {
                      $omit["U+$char"] = 1;
                  }
                  $decomposition["U+$char"] = explode(' ', $translit);
              }
              fclose($fh);
          }
      }

      // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
      foreach ($decomposition as $from => $to) {
          $code_decomp = array();
          // Note: the loop deliberately ends on an empty value, which is how the
          // "omit" entries end up with an empty decomposition.
          while ($code_value = array_shift($to)) {
              if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                  foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                      array_unshift($to, $cv);
                  }
              } elseif (!isset($mark["U+$code_value"])) { // remove mark
                  array_push($code_decomp, $code_value);
              }
          }
          if (count($code_decomp) || isset($omit[$from])) {
              $decomposition[$from] = $code_decomp;
          } else {
              unset($decomposition[$from]);
          }
      }

      // create ascii only mapping
      $this->toASCII['utf-8'] = array();
      $ascii =& $this->toASCII['utf-8'];
      foreach ($decomposition as $from => $to) {
          $code_decomp = array();
          while ($code_value = array_shift($to)) {
              $ord = hexdec($code_value);
              if ($ord > 127) {
                  continue 2; // skip decompositions containing non-ASCII chars
              }
              array_push($code_decomp, chr($ord));
          }
          // Keys look like "U+00E9": cut off the prefix instead of passing
          // non-hex characters to hexdec().
          $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
      }

      // add numeric decompositions
      foreach ($number as $from => $to) {
          $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
          if (!isset($ascii[$utf8_char])) {
              $ascii[$utf8_char] = $to;
          }
      }

      if ($cacheFileCase) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
      }
      if ($cacheFileASCII) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
      }
      return 3;
  }
  1233. /**
  1234. * This function initializes the folding table for a charset other than UTF-8.
  1235. * This function is automatically called by the case folding functions.
  1236. *
  1237. * @param string Charset for which to initialize case folding.
  1238. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1239. * @access private
  1240. */
  function initCaseFolding($charset) {
      // Derives the case folding table of $charset ($this->caseFolding[$charset])
      // from the UTF-8 folding table and the charset's conversion table.

      // Only process if the case table is not yet loaded (isset() first, so that
      // an unknown charset does not raise an undefined-index notice):
      if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
          return 1;
      }

      // Use cached version if possible; a cache file that does not unserialize
      // to an array is ignored and the table is rebuilt.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $cached = unserialize(t3lib_div::getUrl($cacheFile));
          if (is_array($cached)) {
              $this->caseFolding[$charset] = $cached;
              return 2;
          }
      }

      // init UTF-8 conversion for this charset
      if (!$this->initCharset($charset)) {
          return FALSE;
      }
      // UTF-8 case folding is used as the base conversion table
      if (!$this->initUnicodeData('case')) {
          return FALSE;
      }

      $nochar = chr($this->noCharByteVal);
      $tables = array('toUpper', 'toLower', 'toTitle');
      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
          $c = $this->utf8_decode($utf8, $charset);

          foreach ($tables as $table) {
              // Most characters have no entry in a given folding table; check
              // first instead of reading an undefined index.
              if (!isset($this->caseFolding['utf-8'][$table][$utf8])) {
                  continue;
              }
              $cc = $this->utf8_decode($this->caseFolding['utf-8'][$table][$utf8], $charset);
              // Keep the mapping only if the folded character exists in the charset.
              if ($cc != '' && $cc != $nochar) {
                  $this->caseFolding[$charset][$table][$c] = $cc;
              }
          }
      }

      // add the ASCII case table
      for ($i = ord('a'); $i <= ord('z'); $i++) {
          $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
      }
      for ($i = ord('A'); $i <= ord('Z'); $i++) {
          $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
      }
      return 3;
  }
  1292. /**
  1293. * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
  1294. * This function is automatically called by the ASCII transliteration functions.
  1295. *
  1296. * @param string Charset for which to initialize conversion.
  1297. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1298. * @access private
  1299. */
  function initToASCII($charset) {
      // Derives the to-ASCII transliteration table of $charset
      // ($this->toASCII[$charset]) from the UTF-8 table and the charset's
      // conversion table.

      // Only process if the table is not yet loaded (isset() first, so that an
      // unknown charset does not raise an undefined-index notice):
      if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
          return 1;
      }

      // Use cached version if possible; a cache file that does not unserialize
      // to an array is ignored and the table is rebuilt.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $cached = unserialize(t3lib_div::getUrl($cacheFile));
          if (is_array($cached)) {
              $this->toASCII[$charset] = $cached;
              return 2;
          }
      }

      // init UTF-8 conversion for this charset
      if (!$this->initCharset($charset)) {
          return FALSE;
      }
      // UTF-8/ASCII transliteration is used as the base conversion table
      if (!$this->initUnicodeData('ascii')) {
          return FALSE;
      }

      // Start from an empty array so that a charset without any transliterable
      // character still ends up with a (valid, empty) table. Otherwise nothing
      // would be stored, NULL would be cached and every call would redo the work.
      $table = array();
      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          if (isset($this->toASCII['utf-8'][$utf8])) {
              // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
              $c = $this->utf8_decode($utf8, $charset);
              $table[$c] = $this->toASCII['utf-8'][$utf8];
          }
      }
      $this->toASCII[$charset] = $table;

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
      }
      return 3;
  }
  1332. /********************************************
  1333. *
  1334. * String operation functions
  1335. *
  1336. ********************************************/
  1337. /**
  1338. * Returns a part of a string.
  1339. * Unit-tested by Kasper (single byte charsets only)
  1340. *
  1341. * @param string The character set
  1342. * @param string Character string
  1343. * @param integer Start position (character position)
  1344. * @param integer Length (in characters)
  1345. * @return string The substring
  1346. * @see substr(), mb_substr()
  1347. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1348. */
  function substr($charset, $string, $start, $len = NULL) {
      // Character-wise substr(): $start and $len count characters of $charset,
      // not bytes. An omitted $len means "up to the end of the string".
      if ($len === 0 || $string === '') {
          return '';
      }

      // Configured helper library ('mbstring', 'iconv' or none).
      $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
          ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
          : '';

      if ($utils == 'mbstring') {
          // cannot omit $len, when specifying charset
          if ($len == NULL) {
              $enc = mb_internal_encoding(); // save internal encoding
              mb_internal_encoding($charset);
              $str = mb_substr($string, $start);
              mb_internal_encoding($enc); // restore internal encoding
              return $str;
          }
          return mb_substr($string, $start, $len, $charset);
      }

      if ($utils == 'iconv') {
          if ($len == NULL) {
              // The charset argument comes after the length, so pass the total
              // number of characters as length. This reaches the end of the
              // string without switching the global internal encoding through
              // iconv_set_encoding(), which is deprecated.
              $total = iconv_strlen($string, $charset);
              if ($total === FALSE) {
                  return FALSE;
              }
              return iconv_substr($string, $start, $total, $charset);
          }
          return iconv_substr($string, $start, $len, $charset);
      }

      if ($charset == 'utf-8') {
          return $this->utf8_substr($string, $start, $len);
      }
      if (!empty($this->eucBasedSets[$charset])) {
          return $this->euc_substr($string, $start, $charset, $len);
      }

      // Fixed-width encodings: scale the character positions to bytes. An
      // omitted length must stay omitted - NULL * 2 would be 0 and cut the
      // result down to an empty string.
      if (!empty($this->twoByteSets[$charset])) {
          return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
      }
      if (!empty($this->fourByteSets[$charset])) {
          return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
      }

      // treat everything else as single-byte encoding
      return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
  }
  1389. /**
  1390. * Counts the number of characters.
  1391. * Unit-tested by Kasper (single byte charsets only)
  1392. *
  1393. * @param string The character set
  1394. * @param string Character string
  1395. * @return integer The number of characters
  1396. * @see strlen()
  1397. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>

Large files files are truncated, but you can click here to view the full file