PageRenderTime 32ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/typo3/class.t3lib_cs.php

https://bitbucket.org/synergylearning/campusconnect
PHP | 2367 lines | 1573 code | 218 blank | 576 comment | 376 complexity | 87eb15c41b0d2fb4a78985187ca298ea MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-3.0, GPL-3.0, LGPL-2.1, Apache-2.0, BSD-3-Clause, AGPL-3.0
  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the Typo3 project. The Typo3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. *
  17. * This script is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * This copyright notice MUST APPEAR in all copies of the script!
  23. ***************************************************************/
  24. /**
  25. * Class for conversion between charsets.
  26. *
  27. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  28. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  29. */
  30. /**
  31. * Notes on UTF-8
  32. *
  33. * Functions working on UTF-8 strings:
  34. *
  35. * - strchr/strstr
  36. * - strrchr
  37. * - substr_count
  38. * - implode/explode/join
  39. *
  40. * Functions nearly working on UTF-8 strings:
  41. *
  42. * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  43. * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  44. * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  45. * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
  46. * - preg_*: PCRE support is compiled into PHP by default nowadays, but could be unavailable; patterns need the "u" modifier to treat strings as UTF-8
  47. *
  48. * Functions NOT working on UTF-8 strings:
  49. *
  50. * - str*cmp
  51. * - stristr
  52. * - stripos
  53. * - substr
  54. * - strrev
  55. * - split/spliti
  56. * - ...
  57. *
  58. */
  59. /**
  60. * Class for conversion between charsets
  61. *
  62. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  63. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  64. * @package TYPO3
  65. * @subpackage t3lib
  66. */
  67. class t3lib_cs {
  /**
   * Locales helper, assigned in the constructor.
   *
   * @var t3lib_l10n_Locales
   */
  protected $locales;

  // Byte value emitted for characters that have no equivalent (63 is "?" in ASCII).
  public $noCharByteVal = 63;

  // Cache of parsed conversion tables, keyed by charset name.
  public $parsedCharsets = array();

  // Cache of case folding data.
  public $caseFolding = array();

  // Cache of charset-to-ASCII mappings.
  public $toASCII = array();

  // Charsets that use two bytes per character.
  public $twoByteSets = array(
    'ucs-2' => 1, // 2-byte Unicode
  );

  // Charsets that use four bytes per character.
  public $fourByteSets = array(
    'ucs-4' => 1, // 4-byte Unicode
    'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  );

  // Charsets that use a scheme like the Extended Unix Code.
  public $eucBasedSets = array(
    'gb2312' => 1, // Chinese, simplified.
    'big5' => 1, // Chinese, traditional.
    'euc-kr' => 1, // Korean
    'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
  );
  // Alias => canonical charset name, consulted by parse_charset() after the
  // input has been lowercased and trimmed. Every key must be lowercase and
  // must appear only once (a repeated key silently overwrites the earlier one).
  // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
  // http://czyborra.com/charsets/iso8859.html
  var $synonyms = array(
    'us' => 'ascii',
    'us-ascii' => 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4',
  );
  // Mapping of language identifiers to script names. The script names are the
  // keys of $script_to_charset_unix / $script_to_charset_windows; a value that
  // is not a key there makes get_locale_charset() fall back to its default.
  var $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',
    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian
    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    'bulgarian' => 'cyrillic', // same script as the 'bg' and 'bgr' entries
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european', // historic misspelling of "swedish", kept for backwards compatibility
    'thai' => 'thai',
    'that' => 'thai', // historic misspelling of "thai", kept for backwards compatibility
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
  );
  // Script (language family) name => default charset used on non-Windows systems.
  // get_locale_charset() treats an empty value like a missing entry.
  public $script_to_charset_unix = array(
    'west_european' => 'iso-8859-1',
    'estonian' => 'iso-8859-1',
    'east_european' => 'iso-8859-2',
    'baltic' => 'iso-8859-4',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'greek' => 'iso-8859-7',
    'hebrew' => 'iso-8859-8',
    'turkish' => 'iso-8859-9',
    'thai' => 'iso-8859-11', // = TIS-620
    'lithuanian' => 'iso-8859-13',
    'chinese' => 'gb2312', // = euc-cn
    'japanese' => 'euc-jp',
    'korean' => 'euc-kr',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'vietnamese' => '',
    'unicode' => 'utf-8',
    'albanian' => 'utf-8'
  );
  // Script (language family) name => default charset used when TYPO3_OS is "WIN".
  // NOTE(review): 'cp874' and 'cp949' are returned as-is by get_locale_charset(),
  // i.e. without passing through parse_charset() - confirm callers expect that.
  public $script_to_charset_windows = array(
    'east_european' => 'windows-1250',
    'cyrillic' => 'windows-1251',
    'west_european' => 'windows-1252',
    'greek' => 'windows-1253',
    'turkish' => 'windows-1254',
    'hebrew' => 'windows-1255',
    'arabic' => 'windows-1256',
    'baltic' => 'windows-1257',
    'estonian' => 'windows-1257',
    'lithuanian' => 'windows-1257',
    'vietnamese' => 'windows-1258',
    'thai' => 'cp874',
    'korean' => 'cp949',
    'chinese' => 'gb2312',
    'japanese' => 'shift_jis',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'albanian' => 'windows-1250',
    'unicode' => 'utf-8'
  );
  // Mapping of complete locale names to charsets.
  // get_locale_charset() lowercases the locale before it looks it up here, so
  // only all-lowercase keys can ever match.
  var $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    'sr@Latn' => 'iso-8859-2', // original mixed-case key, kept for code that reads this array directly
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5',
  );
  // TYPO3 specific: system charset used for each TYPO3 system language key.
  // An empty value stands for "iso-8859-1".
  public $charSetArray = array(
    'af' => '',
    'ar' => 'iso-8859-6',
    'ba' => 'iso-8859-2',
    'bg' => 'windows-1251',
    'br' => '',
    'ca' => 'iso-8859-15',
    'ch' => 'gb2312',
    'cs' => 'windows-1250',
    'cz' => 'windows-1250',
    'da' => '',
    'de' => '',
    'dk' => '',
    'el' => 'iso-8859-7',
    'eo' => 'utf-8',
    'es' => '',
    'et' => 'iso-8859-4',
    'eu' => '',
    'fa' => 'utf-8',
    'fi' => '',
    'fo' => 'utf-8',
    'fr' => '',
    'fr_CA' => '',
    'ga' => '',
    'ge' => 'utf-8',
    'gl' => '',
    'gr' => 'iso-8859-7',
    'he' => 'utf-8',
    'hi' => 'utf-8',
    'hk' => 'big5',
    'hr' => 'windows-1250',
    'hu' => 'iso-8859-2',
    'is' => 'utf-8',
    'it' => '',
    'ja' => 'shift_jis',
    'jp' => 'shift_jis',
    'ka' => 'utf-8',
    'kl' => 'utf-8',
    'km' => 'utf-8',
    'ko' => 'euc-kr',
    'kr' => 'euc-kr',
    'lt' => 'windows-1257',
    'lv' => 'utf-8',
    'ms' => '',
    'my' => '',
    'nl' => '',
    'no' => '',
    'pl' => 'iso-8859-2',
    'pt' => '',
    'pt_BR' => '',
    'qc' => '',
    'ro' => 'iso-8859-2',
    'ru' => 'windows-1251',
    'se' => '',
    'si' => 'windows-1250',
    'sk' => 'windows-1250',
    'sl' => 'windows-1250',
    'sq' => 'utf-8',
    'sr' => 'utf-8',
    'sv' => '',
    'th' => 'iso-8859-11',
    'tr' => 'iso-8859-9',
    'ua' => 'windows-1251',
    'uk' => 'windows-1251',
    'vi' => 'utf-8',
    'vn' => 'utf-8',
    'zh' => 'big5',
  );
  // TYPO3 specific: ISO name for each TYPO3 system language key.
  // A key that is missing here has an ISO name identical to the TYPO3 key.
  // @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
  public $isoArray = array(
    'ba' => 'bs',
    'br' => 'pt_BR',
    'ch' => 'zh_CN',
    'cz' => 'cs',
    'dk' => 'da',
    'si' => 'sl',
    'se' => 'sv',
    'gl' => 'kl',
    'gr' => 'el',
    'hk' => 'zh_HK',
    'kr' => 'ko',
    'ua' => 'uk',
    'jp' => 'ja',
    'qc' => 'fr_CA',
    'vn' => 'vi',
    'ge' => 'ka',
    'ga' => 'gl',
  );
  /**
   * Default constructor.
   *
   * Obtains the locales helper through t3lib_div::makeInstance() and keeps
   * it in $this->locales.
   */
  public function __construct() {
    $localesClassName = 't3lib_l10n_Locales';
    $this->locales = t3lib_div::makeInstance($localesClassName);
  }
  /**
   * Normalizes a charset name: lowercases it, trims surrounding whitespace
   * and replaces a known alias from $this->synonyms by its canonical name.
   *
   * @param string Input charset
   * @return string Normalized charset
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function parse_charset($charset) {
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized])
      ? $this->synonyms[$normalized]
      : $normalized;
  }
  /**
   * Get the charset of a locale.
   *
   * Accepted locale formats:
   *   ln             language
   *   ln_CN          language / country
   *   ln_CN.cs       language / country / charset
   *   ln_CN.cs@mod   language / country / charset / modifier
   *
   * Resolution order:
   *   1. exact entry in $this->locale_to_charset (compared case-insensitively)
   *   2. charset embedded in the locale, normalized with parse_charset()
   *   3. modifier "euro" => iso-8859-15
   *   4. script of the language, mapped through the OS specific table;
   *      unknown languages/scripts yield "windows-1252" when TYPO3_OS is
   *      "WIN" and "utf-8" otherwise.
   *
   * @param string Locale string
   * @return string Charset resolved for locale string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function get_locale_charset($locale) {
    $locale = strtolower($locale);

    // exact locale specific charset? The locale has just been lowercased, so
    // the table keys are lowercased as well before comparing.
    $exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
    if (isset($exactMatches[$locale])) {
      return $exactMatches[$locale];
    }

    // get modifier (optional, so do not rely on offset 1 being present)
    $parts = explode('@', $locale);
    $locale = $parts[0];
    $modifier = isset($parts[1]) ? $parts[1] : '';

    // locale contains charset: use it
    $parts = explode('.', $locale);
    $locale = $parts[0];
    if (!empty($parts[1])) {
      return $this->parse_charset($parts[1]);
    }

    // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
      return 'iso-8859-15';
    }

    // get language; an unknown language maps to no script at all
    $parts = explode('_', $locale);
    $language = $parts[0];
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

    if (TYPO3_OS == 'WIN') {
      $scriptToCharset = $this->script_to_charset_windows;
      $fallbackCharset = 'windows-1252';
    } else {
      $scriptToCharset = $this->script_to_charset_unix;
      $fallbackCharset = 'utf-8';
    }

    // empty table values (e.g. "vietnamese" on Unix) count as "not defined"
    return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $fallbackCharset;
  }
  552. /********************************************
  553. *
  554. * Charset Conversion functions
  555. *
  556. ********************************************/
  /**
   * Convert from one charset to another charset.
   *
   * The conversion library selected in
   * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] (mbstring,
   * iconv or recode) is tried first, unless entity fallback is requested for
   * a non-UTF-8 target. If no library is selected, or the library reports
   * failure, the string is routed through utf8_encode() / utf8_decode().
   *
   * @param string Input string
   * @param string From charset (the current charset of the string)
   * @param string To charset (the output charset wanted)
   * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Converted string
   * @see convArray()
   */
  function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
    if ($fromCS == $toCS) {
      return $str;
    }

    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    $mayUsePhpLibrary = ($toCS == 'utf-8' || !$useEntityForNoChar);
    if ($mayUsePhpLibrary) {
      $method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
      $converted = FALSE;
      if ($method == 'mbstring') {
        // returns FALSE for unsupported charsets
        $converted = mb_convert_encoding($str, $toCS, $fromCS);
      } elseif ($method == 'iconv') {
        $converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
      } elseif ($method == 'recode') {
        $converted = recode_string($fromCS . '..' . $toCS, $str);
      }
      if ($converted !== FALSE) {
        return $converted;
      }
      // otherwise: fallback to TYPO3 conversion
    }

    // TYPO3 conversion always goes through UTF-8 as intermediate charset
    $utf8 = ($fromCS != 'utf-8') ? $this->utf8_encode($str, $fromCS) : $str;
    return ($toCS != 'utf-8')
      ? $this->utf8_decode($utf8, $toCS, $useEntityForNoChar)
      : $utf8;
  }
  /**
   * Convert all string elements of an array from one charset to another.
   * Nested arrays are processed recursively; values of any other type are
   * left untouched.
   * NOTICE: Array is passed by reference!
   *
   * @param array Input array, possibly multidimensional
   * @param string From charset (the current charset of the string)
   * @param string To charset (the output charset wanted)
   * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return void
   * @see conv()
   */
  function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
    foreach ($array as $key => $value) {
      if (is_array($value)) {
        // recurse on the element itself so the change lands in $array
        $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
        continue;
      }
      if (is_string($value)) {
        $array[$key] = $this->conv($value, $fromCS, $toCS, $useEntityForNoChar);
      }
    }
  }
  /**
   * Converts $str from $charset to UTF-8
   *
   * Bytes 0-127 are copied unchanged. Everything else is looked up in the
   * 'local' part of the parsed conversion table of the charset; characters
   * missing from the table are replaced by chr($this->noCharByteVal).
   *
   * @param string String in local charset to convert to UTF-8
   * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
   * @return string Output string, converted to UTF-8. Empty string if the conversion table could not be initialized.
   */
  function utf8_encode($str, $charset) {
    if ($charset === 'utf-8') {
      return $str;
    }
    // Parse conv. table if not already done; without a table nothing can be converted.
    if (!$this->initCharset($charset)) {
      return '';
    }

    $localTable = isset($this->parsedCharsets[$charset]['local']) ? $this->parsedCharsets[$charset]['local'] : array();
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);

    $strLen = strlen($str);
    $outStr = '';
    for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
      $chr = substr($str, $a, 1);
      $ord = ord($chr);
      if ($isTwoByteSet) {
        // Two bytes per char, big endian assumed. substr() is used instead
        // of a string offset so a dangling last byte of an odd-length
        // string is paired with 0 without raising an error.
        $a++;
        $ord = $ord << 8 | ord(substr($str, $a, 1));
      } elseif ($ord > 127) {
        // EUC uses two bytes above 127; fetch both and make $ord a 16bit int.
        // Shift-JIS: chars between 160 and 223 are single byte.
        if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
          $a++;
          $ord = $ord * 256 + ord(substr($str, $a, 1));
        }
      } else {
        // ASCII 0-127 is one byte and passes through transparently
        $outStr .= $chr;
        continue;
      }
      // Use the table entry for the local char number, otherwise the "no char" byte
      $outStr .= isset($localTable[$ord]) ? $localTable[$ord] : $noChar;
    }
    return $outStr;
  }
  /**
   * Converts $str from UTF-8 to $charset
   *
   * Bytes 0-127 are copied unchanged. Each multibyte sequence is looked up in
   * the 'utf8' part of the parsed conversion table of the charset. Sequences
   * missing from the table become a numeric entity (if requested) or
   * chr($this->noCharByteVal); a continuation byte found without a lead byte
   * always becomes chr($this->noCharByteVal).
   *
   * @param string String in UTF-8 to convert to local charset
   * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
   * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Output string, converted to local charset. Empty string if the conversion table could not be initialized.
   */
  function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
    if ($charset === 'utf-8') {
      return $str;
    }
    // Parse conv. table if not already done; without a table nothing can be converted.
    if (!$this->initCharset($charset)) {
      return '';
    }

    $utf8Table = isset($this->parsedCharsets[$charset]['utf8']) ? $this->parsedCharsets[$charset]['utf8'] : array();
    $noChar = chr($this->noCharByteVal);

    $strLen = strlen($str);
    $outStr = '';
    for ($a = 0; $a < $strLen; $a++) { // Traverse each byte in UTF-8 string.
      $chr = substr($str, $a, 1);
      $ord = ord($chr);

      if ($ord <= 127) {
        // ASCII 0-127 is one byte and passes through transparently
        $outStr .= $chr;
        continue;
      }
      if (!($ord & 64)) {
        // A lead byte must have the 7th bit set; this is the MIDDLE of a sequence
        $outStr .= $noChar;
        continue;
      }

      // Collect the sequence: every further high bit set in the lead byte
      // announces one more byte.
      $buf = $chr;
      for ($b = 0; $b < 8; $b++) {
        $ord = $ord << 1;
        if (!($ord & 128)) {
          break;
        }
        $a++;
        $buf .= substr($str, $a, 1);
      }

      if (isset($utf8Table[$buf])) {
        $mByte = $utf8Table[$buf]; // The local number
        if ($mByte > 255) {
          // Local numbers above 255 are split into two chars (16bit word assumed)
          $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
        } else {
          $outStr .= chr($mByte);
        }
      } elseif ($useEntityForNoChar) {
        // Create num entity
        $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
      } else {
        $outStr .= $noChar;
      }
    }
    return $outStr;
  }
  /**
   * Converts all chars > 127 to numeric entities.
   *
   * ASCII bytes are copied unchanged. A continuation byte that is not
   * preceded by a lead byte is replaced by chr($this->noCharByteVal).
   *
   * @param string Input string
   * @return string Output string
   */
  function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
      $byte = substr($str, $pos, 1);
      $code = ord($byte);
      if ($code <= 127) {
        // plain ASCII, one byte, transparent
        $result .= $byte;
      } elseif (!($code & 64)) {
        // the 7th bit is not set, so this is the MIDDLE of a sequence
        $result .= chr($this->noCharByteVal);
      } else {
        // lead byte: each further high bit announces one more byte
        $sequence = $byte;
        for ($shift = 0; $shift < 8; $shift++) {
          $code = $code << 1;
          if (!($code & 128)) {
            break;
          }
          $pos++;
          $sequence .= substr($str, $pos, 1);
        }
        $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
      }
      $pos++;
    }
    return $result;
  }
  /**
   * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
   *
   * The hexadecimal marker is accepted in both cases (&#x1b; and &#X1B;).
   * Entities that cannot be converted are left in the string unchanged:
   * malformed numeric entities (e.g. "&#;", "&#x;", "&#12ab;") and named
   * entities that are unknown or not requested via $alsoStdHtmlEnt.
   *
   * @param string Input string, UTF-8
   * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
   * @return string Output string
   */
  function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
    // Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
    // see http://php.net/manual/en/function.get-html-translation-table.php
    $applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
      if ($applyPhpCompatibilityFix === TRUE) {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
      } else {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
      }
    }

    // Split into alternating [text, entity name, text, entity name, ...].
    // The captured entity names always sit on the odd indexes.
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
      return $str;
    }

    foreach ($parts as $k => $v) {
      // only take every second element
      if ($k % 2 === 0) {
        continue;
      }
      $converted = NULL;
      if (substr($v, 0, 1) == '#') { // Dec or hex entities:
        $number = substr($v, 1);
        if (preg_match('/^[xX]([0-9a-fA-F]+)$/D', $number, $hexMatch)) {
          $converted = $this->UnumberToChar(hexdec($hexMatch[1]));
        } elseif (preg_match('/^[0-9]+$/D', $number)) {
          $converted = $this->UnumberToChar((int) $number);
        }
      } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
        $converted = $trans_tbl['&' . $v . ';'];
        if ($applyPhpCompatibilityFix === TRUE) {
          // the table of old PHP versions is in iso-8859-1
          $converted = $this->utf8_encode($converted, 'iso-8859-1');
        }
      }
      // No conversion: restore the entity exactly as it was written
      $parts[$k] = ($converted !== NULL) ? $converted : '&' . $v . ';';
    }
    return implode('', $parts);
  }
  /**
   * Converts all chars in the input UTF-8 string into integer numbers returned in an array
   *
   * A continuation byte that is not preceded by a lead byte is reported as
   * $this->noCharByteVal (or the matching character if $retChar is set).
   *
   * @param string Input string, UTF-8
   * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
   * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
   * @return array Output array with the char numbers
   */
  function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    // If entities must be registered as well, resolve them first
    if ($convEntities) {
      $str = $this->entities_to_utf8($str, 1);
    }

    $length = strlen($str);
    $result = array();
    $pos = 0;
    while ($pos < $length) {
      $byte = substr($str, $pos, 1);
      $code = ord($byte);
      if ($code <= 127) {
        // plain ASCII, one byte, transparent
        $result[] = $retChar ? chr($code) : $code;
      } elseif (!($code & 64)) {
        // the 7th bit is not set, so this is the MIDDLE of a sequence
        $result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
      } else {
        // lead byte: each further high bit announces one more byte
        $sequence = $byte;
        for ($shift = 0; $shift < 8; $shift++) {
          $code = $code << 1;
          if (!($code & 128)) {
            break;
          }
          $pos++;
          $sequence .= substr($str, $pos, 1);
        }
        $result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
      }
      $pos++;
    }
    return $result;
  }
  /**
   * Converts a UNICODE number to a UTF-8 multibyte character
   * Algorithm based on script found at From: http://czyborra.com/utf/
   *
   * The bits of the integer value are spread across the bytes; the number of high bits
   * set in the lead byte announces the number of bytes in the multibyte sequence:
   *
   * bytes | bits | representation
   * 1 | 7 | 0vvvvvvv
   * 2 | 11 | 110vvvvv 10vvvvvv
   * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
   * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
   * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
   * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
   *
   * @param integer UNICODE integer
   * @return string UTF-8 multibyte character string
   * @see utf8CharToUnumber()
   */
  function UnumberToChar($cbyte) {
      // 1 byte: plain ASCII
      if ($cbyte < 0x80) {
          return chr($cbyte);
      }
      // 2 bytes
      if ($cbyte < 0x800) {
          return chr(0xC0 | ($cbyte >> 6))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 3 bytes
      if ($cbyte < 0x10000) {
          return chr(0xE0 | ($cbyte >> 12))
              . chr(0x80 | (($cbyte >> 6) & 0x3F))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 4 bytes
      if ($cbyte < 0x200000) {
          return chr(0xF0 | ($cbyte >> 18))
              . chr(0x80 | (($cbyte >> 12) & 0x3F))
              . chr(0x80 | (($cbyte >> 6) & 0x3F))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 5 bytes
      if ($cbyte < 0x4000000) {
          return chr(0xF8 | ($cbyte >> 24))
              . chr(0x80 | (($cbyte >> 18) & 0x3F))
              . chr(0x80 | (($cbyte >> 12) & 0x3F))
              . chr(0x80 | (($cbyte >> 6) & 0x3F))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // 6 bytes
      if ($cbyte < 0x80000000) {
          return chr(0xFC | ($cbyte >> 30))
              . chr(0x80 | (($cbyte >> 24) & 0x3F))
              . chr(0x80 | (($cbyte >> 18) & 0x3F))
              . chr(0x80 | (($cbyte >> 12) & 0x3F))
              . chr(0x80 | (($cbyte >> 6) & 0x3F))
              . chr(0x80 | ($cbyte & 0x3F));
      }
      // Cannot express a 32-bit character in UTF-8
      return chr($this->noCharByteVal);
  }
  /**
   * Converts a UTF-8 multibyte character to a UNICODE number.
   * Only the first character of the input is evaluated.
   *
   * @param string UTF-8 multibyte character string
   * @param boolean If set, the number is returned as a hex string prefixed with "x".
   * @return integer UNICODE integer (or "x"-prefixed hex string, see above)
   * @see UnumberToChar()
   */
  function utf8CharToUnumber($str, $hex = 0) {
      $leadByte = ord(substr($str, 0, 1));

      if (($leadByte & 192) != 192) {
          // Not a lead byte of a multibyte sequence: the byte value is the number.
          $number = $leadByte;
      } else {
          $shifted = $leadByte;
          $payloadBits = '';
          $followers = 0;
          // Each further high bit in the lead byte stands for one continuation byte,
          // which contributes its lowest six bits.
          while ($followers < 8) {
              $shifted = $shifted << 1;
              if (!($shifted & 128)) {
                  break;
              }
              $payloadBits .= substr('00000000' . decbin(ord(substr($str, $followers + 1, 1))), -6);
              $followers++;
          }
          // The lead byte contributes its lowest (6 - number of continuation bytes) bits.
          $leadBits = substr('00000000' . decbin($leadByte), -(6 - $followers));
          $number = bindec($leadBits . $payloadBits);
      }

      return $hex ? 'x' . dechex($number) : $number;
  }
  938. /********************************************
  939. *
  940. * Init functions
  941. *
  942. ********************************************/
  /**
   * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
   * This function is automatically called by the conversion functions
   *
   * The parsed table is stored in $this->parsedCharsets[$charset] as
   * array('local' => array(byte value => UTF-8 char), 'utf8' => array(UTF-8 char => byte value))
   * and only holds entries for local byte values above 127.
   *
   * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
   *
   * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
   * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
   * @access private
   */
  function initCharset($charset) {
      // Already loaded? isset() comes first so that an unknown charset does not trigger an undefined-index notice.
      if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
          return 1;
      }

      // Conversion table filename:
      $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
      if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
          return FALSE;
      }

      // Cache file for charsets:
      // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          // NOTE(review): unserialize() is only safe as long as the cache directory cannot be written by untrusted parties.
          $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // Parse conversion table into lines:
      $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
      // Initialize the internal variable holding the conv. table:
      $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

      // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
      $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
      $detectedType = '';

      foreach ($lines as $value) {
          // Comment lines and blanks are ignored.
          if (!trim($value) || substr($value, 0, 1) == '#') {
              continue;
          }
          // Detect type if not done yet: (Done on first real line)
          if (!$detectedType) {
              $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
          }

          if ($detectedType == 'ms-token') {
              $fields = preg_split('/[=:]/', $value, 3);
              $hexbyte = $fields[0];
              // A line without "=" has no Unicode part; it is treated as empty (code point 0) as before.
              $utf8 = isset($fields[1]) ? $fields[1] : '';
          } else {
              $regA = array();
              if (!preg_match($whitespacedPattern, $value, $regA)) {
                  // A line not following the detected syntax never yielded a byte value above 127, so it is skipped.
                  continue;
              }
              $hexbyte = $regA[1];
              $utf8 = 'U+' . $regA[2];
          }

          $decval = hexdec(trim($hexbyte));
          // Values up to 127 are taken to be identical to ASCII and are not stored.
          if ($decval > 127) {
              // Strip the leading "U+" before converting the hex number.
              $utf8decval = hexdec(substr(trim($utf8), 2));
              $utf8Char = $this->UnumberToChar($utf8decval);
              $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
              $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
          }
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
      }
      return 2;
  }
  /**
   * This function initializes all UTF-8 character data tables.
   *
   * Builds $this->caseFolding['utf-8'] (toUpper/toLower/toTitle) and $this->toASCII['utf-8']
   * from the files in PATH_t3lib.'unidata/' (UnicodeData.txt is required, SpecialCasing.txt and
   * Translit.txt are optional) and writes both tables to their cache files.
   * When $mode is 'case' or 'ascii' an already loaded or cached table of that kind is used instead.
   *
   * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
   *
   * @param string Mode ("case", "ascii", ...)
   * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
   * @access private
   */
  function initUnicodeData($mode = NULL) {
      // cache files
      $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
      $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

      // Only process if the tables are not yet loaded
      // (isset() first, so a table that was never touched does not raise an undefined-index notice)
      switch ($mode) {
          case 'case':
              if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                  return 1;
              }
              // Use cached version if possible
              if ($cacheFileCase && @is_file($cacheFileCase)) {
                  $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                  return 2;
              }
              break;
          case 'ascii':
              if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                  return 1;
              }
              // Use cached version if possible
              if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                  $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                  return 2;
              }
              break;
      }

      // process main Unicode data file
      $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
      if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
          return FALSE;
      }
      $fh = fopen($unicodeDataFile, 'rb');
      if (!$fh) {
          return FALSE;
      }

      // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
      // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
      $this->caseFolding['utf-8'] = array();
      $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
      $utf8CaseFolding['toUpper'] = array();
      $utf8CaseFolding['toLower'] = array();
      $utf8CaseFolding['toTitle'] = array();

      $decomposition = array(); // array of temp. decompositions
      $mark = array(); // array of chars that are marks (eg. composing accents)
      $number = array(); // array of chars that are numbers (eg. digits)
      $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

      // Reading until fgets() fails avoids processing the bogus FALSE "line" a feof() loop yields at the end of the file.
      while (($line = fgets($fh, 4096)) !== FALSE) {
          $line = rtrim($line);
          if ($line === '') {
              continue;
          }
          // has a lot of info; padded so that short rows do not produce undefined offsets
          list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', $line), 15, '');

          $ord = hexdec($char);
          if ($ord > 0xFFFF) {
              break;
          } // only process the BMP

          $utf8_char = $this->UnumberToChar($ord);
          if ($upper) {
              $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
          }
          if ($lower) {
              $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
          }
          // store "title" only when different from "upper" (only a few)
          if ($title && $title != $upper) {
              $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
          }

          // First letter of the general category; substr() instead of the removed $cat{0} syntax
          switch (substr($cat, 0, 1)) {
              case 'M': // mark (accent, umlaut, ...)
                  $mark["U+$char"] = 1;
                  break;
              case 'N': // numeric value
                  if ($ord > 0x80 && $num != '') {
                      $number["U+$char"] = $num;
                  }
                  break;
          }

          // accented Latin letters without "official" decomposition
          $match = array();
          if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
              $c = ord($match[2]);
              if ($match[1] == 'SMALL') {
                  $c += 32;
              }
              $decomposition["U+$char"] = array(dechex($c));
              continue;
          }

          $match = array();
          if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
              switch ($match[1]) {
                  case '<circle>': // add parenthesis as circle replacement, eg (1)
                      $match[2] = '0028 ' . $match[2] . ' 0029';
                      break;
                  case '<square>': // add square brackets as square replacement, eg [1]
                      $match[2] = '005B ' . $match[2] . ' 005D';
                      break;
                  case '<compat>': // ignore multi char decompositions that start with a space
                      if (preg_match('/^0020 /', $match[2])) {
                          continue 2;
                      }
                      break;
                  // ignore Arabic and vertical layout presentation decomposition
                  case '<initial>':
                  case '<medial>':
                  case '<final>':
                  case '<isolated>':
                  case '<vertical>':
                      continue 2;
              }
              $decomposition["U+$char"] = explode(' ', $match[2]);
          }
      }
      fclose($fh);

      // process additional Unicode data for casing (allow folded characters to expand into a sequence)
      $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
      if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
          $fh = fopen($specialCasingFile, 'rb');
          if ($fh) {
              while (($line = fgets($fh, 4096)) !== FALSE) {
                  if (substr($line, 0, 1) == '#' || trim($line) == '') {
                      continue;
                  }
                  list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
                  // Only unconditional mappings are used: the 5th field is then empty or already the trailing comment.
                  if ($cond != '' && substr($cond, 0, 1) != '#') {
                      continue;
                  }
                  $utf8_char = $this->UnumberToChar(hexdec($char));
                  if ($char != $lower) {
                      $sequence = '';
                      foreach (explode(' ', $lower) as $codePoint) {
                          $sequence .= $this->UnumberToChar(hexdec($codePoint));
                      }
                      $utf8CaseFolding['toLower'][$utf8_char] = $sequence;
                  }
                  if ($char != $title && $title != $upper) {
                      $sequence = '';
                      foreach (explode(' ', $title) as $codePoint) {
                          $sequence .= $this->UnumberToChar(hexdec($codePoint));
                      }
                      $utf8CaseFolding['toTitle'][$utf8_char] = $sequence;
                  }
                  if ($char != $upper) {
                      $sequence = '';
                      foreach (explode(' ', $upper) as $codePoint) {
                          $sequence .= $this->UnumberToChar(hexdec($codePoint));
                      }
                      $utf8CaseFolding['toUpper'][$utf8_char] = $sequence;
                  }
              }
              fclose($fh);
          }
      }

      // process custom decompositions
      $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
      if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
          $fh = fopen($customTranslitFile, 'rb');
          if ($fh) {
              while (($line = fgets($fh, 4096)) !== FALSE) {
                  if (substr($line, 0, 1) == '#' || trim($line) == '') {
                      continue;
                  }
                  list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
                  // An empty transliteration means: drop this character altogether.
                  if (!$translit) {
                      $omit["U+$char"] = 1;
                  }
                  $decomposition["U+$char"] = explode(' ', $translit);
              }
              fclose($fh);
          }
      }

      // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
      foreach ($decomposition as $from => $to) {
          $code_decomp = array();
          while ($code_value = array_shift($to)) {
              if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                  foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                      array_unshift($to, $cv);
                  }
              } elseif (!isset($mark["U+$code_value"])) { // remove mark
                  $code_decomp[] = $code_value;
              }
          }
          if (count($code_decomp) || isset($omit[$from])) {
              $decomposition[$from] = $code_decomp;
          } else {
              unset($decomposition[$from]);
          }
      }

      // create ascii only mapping
      $this->toASCII['utf-8'] = array();
      $ascii =& $this->toASCII['utf-8'];
      foreach ($decomposition as $from => $to) {
          $code_decomp = array();
          while ($code_value = array_shift($to)) {
              $ord = hexdec($code_value);
              if ($ord > 127) {
                  continue 2; // skip decompositions containing non-ASCII chars
              }
              $code_decomp[] = chr($ord);
          }
          // Keys look like "U+00C4"; strip the "U+" so hexdec() only sees hex digits.
          $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
      }

      // add numeric decompositions
      foreach ($number as $from => $to) {
          $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
          if (!isset($ascii[$utf8_char])) {
              $ascii[$utf8_char] = $to;
          }
      }

      if ($cacheFileCase) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
      }
      if ($cacheFileASCII) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
      }
      return 3;
  }
  /**
   * This function initializes the folding table for a charset other than UTF-8.
   * This function is automatically called by the case folding functions.
   *
   * The table is derived from the UTF-8 case folding table: every non-ASCII character of the
   * charset is looked up there and the result is converted back into the charset.
   *
   * @param string Charset for which to initialize case folding.
   * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
   * @access private
   */
  function initCaseFolding($charset) {
      // Only process if the case table is not yet loaded
      // (isset() first, so an unknown charset does not raise an undefined-index notice)
      if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
          return 1;
      }

      // Use cached version if possible
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // init UTF-8 conversion for this charset
      if (!$this->initCharset($charset)) {
          return FALSE;
      }
      // UTF-8 case folding is used as the base conversion table
      if (!$this->initUnicodeData('case')) {
          return FALSE;
      }

      $nochar = chr($this->noCharByteVal);
      $caseTypes = array('toUpper', 'toLower', 'toTitle');
      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
          $c = $this->utf8_decode($utf8, $charset);
          foreach ($caseTypes as $caseType) {
              // Most characters have no mapping for a given case type; an empty string is decoded
              // in that case instead of reading an undefined index.
              // NOTE(review): assumes utf8_decode() treats '' like the NULL it received before - confirm.
              $folded = isset($this->caseFolding['utf-8'][$caseType][$utf8])
                  ? $this->caseFolding['utf-8'][$caseType][$utf8]
                  : '';
              $cc = $this->utf8_decode($folded, $charset);
              // Skip results that are empty or the "no character" placeholder.
              if ($cc != '' && $cc != $nochar) {
                  $this->caseFolding[$charset][$caseType][$c] = $cc;
              }
          }
      }

      // add the ASCII case table
      for ($i = ord('a'); $i <= ord('z'); $i++) {
          $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
      }
      for ($i = ord('A'); $i <= ord('Z'); $i++) {
          $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
      }
      return 3;
  }
  /**
   * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
   * This function is automatically called by the ASCII transliteration functions.
   *
   * The table is derived from the UTF-8 transliteration table: every non-ASCII character of the
   * charset that has an entry there gets the same ASCII replacement.
   *
   * @param string Charset for which to initialize conversion.
   * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
   * @access private
   */
  function initToASCII($charset) {
      // Only process if the table is not yet loaded
      // (isset() first, so an unknown charset does not raise an undefined-index notice)
      if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
          return 1;
      }

      // Use cached version if possible
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // init UTF-8 conversion for this charset
      if (!$this->initCharset($charset)) {
          return FALSE;
      }
      // UTF-8/ASCII transliteration is used as the base conversion table
      if (!$this->initUnicodeData('ascii')) {
          return FALSE;
      }

      // Start from an empty table so that a charset without any transliterable character is
      // still stored (and cached) as an array and recognised as "loaded" on the next call.
      $this->toASCII[$charset] = array();
      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          if (isset($this->toASCII['utf-8'][$utf8])) {
              // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
              $c = $this->utf8_decode($utf8, $charset);
              $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
          }
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
      }
      return 3;
  }
  1332. /********************************************
  1333. *
  1334. * String operation functions
  1335. *
  1336. ********************************************/
  /**
   * Returns a part of a string.
   * Unit-tested by Kasper (single byte charsets only)
   *
   * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is delegated to
   * mbstring or iconv; otherwise the charset specific implementations of this class are used.
   *
   * @param string The character set
   * @param string Character string
   * @param integer Start position (character position)
   * @param integer Length (in characters); if omitted, everything from the start position to the end is returned
   * @return string The substring
   * @see substr(), mb_substr()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function substr($charset, $string, $start, $len = NULL) {
      if ($len === 0 || $string === '') {
          return '';
      }

      $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
          ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
          : '';

      if ($utils == 'mbstring' || $utils == 'iconv') {
          // The charset argument comes after the length, so a length is always needed.
          // The byte length is an upper bound for the number of characters and therefore means
          // "up to the end" - without temporarily switching the global internal encoding.
          // (Loose comparison kept on purpose: '' and FALSE have always meant "no length" here.)
          if ($len == NULL) {
              $len = strlen($string);
          }
          return $utils == 'mbstring'
              ? mb_substr($string, $start, $len, $charset)
              : iconv_substr($string, $start, $len, $charset);
      }

      if ($charset == 'utf-8') {
          return $this->utf8_substr($string, $start, $len);
      }
      if (!empty($this->eucBasedSets[$charset])) {
          return $this->euc_substr($string, $start, $charset, $len);
      }
      if (!empty($this->twoByteSets[$charset])) {
          // Without this check an omitted length became 0 * 2 = 0 and nothing was returned.
          return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
      }
      if (!empty($this->fourByteSets[$charset])) {
          return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
      }

      // treat everything else as single-byte encoding
      return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
  }
  /**
   * Counts the number of characters.
   * Unit-tested by Kasper (single byte charsets only)
   *
   * mbstring or iconv is used when configured in
   * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']; otherwise the length is
   * calculated by the charset specific implementations of this class.
   *
   * @param string The character set
   * @param string Character string
   * @return integer The number of characters
   * @see strlen()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function strlen($charset, $string) {
      // Delegate to the configured extension, if any
      switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
          case 'mbstring':
              return mb_strlen($string, $charset);
          case 'iconv':
              return iconv_strlen($string, $charset);
      }

      if ($charset == 'utf-8') {
          return $this->utf8_strlen($string);
      }
      if ($this->eucBasedSets[$charset]) {
          return $this->euc_strlen($string, $charset);
      }

      // Fixed-width encodings: number of bytes divided by the bytes per character.
      // Everything that is not listed is treated as single-byte encoding.
      $byteLength = strlen($string);
      if ($this->twoByteSets[$charset]) {
          return $byteLength / 2;
      }
      if ($this->fourByteSets[$charset]) {
          return $byteLength / 4;
      }
      return $byteLength;
  }
  /**
   * Method to crop strings using the mb_substr function.
   * A positive length keeps the beginning of the string and appends the crop signifier,
   * a negative length keeps the end of the string and prepends it.
   * Strings that are not longer than the crop length are returned unchanged.
   *
   * @param string The character set
   * @param string String to be cropped
   * @param integer Crop length (in characters)
   * @param string Crop signifier
   * @return string The shortened string
   * @see mb_strlen(), mb_substr()
   */
  protected function cropMbstring($charset, $string, $len, $crop = '') {
      // A length of zero means "do not crop"
      if (intval($len) === 0) {
          return $string;
      }

      $charCount = mb_strlen($string, $charset);
      if ($charCount <= abs($len)) {
          return $string;
      }

      if ($len > 0) {
          return mb_substr($string, 0, $len, $charset) . $crop;
      }
      // Negative length: keep the last abs($len) characters
      return $crop . mb_substr($string, $len, $charCount, $charset);
  }
  /**
   * Truncates a string and pre-/appends a string.
   * Unit tested by Kasper
   *
   * A positive length keeps the first $len characters and appends the crop signifier,
   * a negative length keeps the last abs($len) characters and prepends it.
   * If the string is not longer than the requested length (or the length is 0) it is returned unchanged.
   *
   * @param string The character set
   * @param string Character string
   * @param integer Length (in characters)
   * @param string Crop signifier
   * @return string The shortened string
   * @see substr(), mb_strimwidth()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function crop($charset, $string, $len, $crop = '') {
      if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
          && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring'
      ) {
          return $this->cropMbstring($charset, $string, $len, $crop);
      }

      if (intval($len) == 0) {
          return $string;
      }

      // Translate the character position into a byte position ($i)
      if ($charset == 'utf-8') {
          $i = $this->utf8_char2byte_pos($string, $len);
      } elseif (!empty($this->eucBasedSets[$charset])) {
          $i = $this->euc_char2byte_pos($string, $len, $charset);
      } elseif ($len > 0) {
          // everything else is treated as single-byte encoding
          $i = $len;
      } else {
          $i = strlen($string) + $len;
          if ($i <= 0) {
              $i = FALSE;
          }
      }

      if ($i === FALSE) { // $len outside actual string length
          return $string;
      }

      // Only crop when there really is a byte on the far side of the cut; otherwise the string
      // is not longer than requested. Explicit bounds are used instead of probing $string{$i},
      // because that syntax is gone in PHP 8 and negative offsets changed meaning in PHP 7.1.
      $byteLength = strlen($string);
      if ($len > 0) {
          if ($i >= 0 && $i < $byteLength) {
              return substr($string, 0, $i) . $crop;
          }
      } elseif ($i >= 1 && $i <= $byteLength) {
          return $crop . substr($string, $i);
      }

      return $string;
  }
  /**
   * Cuts a string short at a given byte length.
   * The cut is moved back to a character boundary where the charset is known to be multibyte.
   *
   * @param string The character set
   * @param string Character string
   * @param integer The byte length
   * @return string The shortened string
   * @see mb_strcut()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function strtrunc($charset, $string, $len) {
      if ($len <= 0) {
          return '';
      }
      if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
          return mb_strcut($string, 0, $len, $charset);
      }
      if ($charset == 'utf-8') {
          return $this->utf8_strtrunc($string, $len);
      }
      if ($this->eucBasedSets[$charset]) {
          return $this->euc_strtrunc($string, $len, $charset);
      }

      // Fixed-width encodings: realign the length to a multiple of the character width
      if ($this->twoByteSets[$charset]) {
          $len -= $len % 2; // don't cut at odd positions
      } elseif ($this->fourByteSets[$charset]) {
          $len -= $len % 4; // realign to a position divisible by four
      }

      // everything else is treated as single-byte encoding
      return substr($string, 0, $len);
  }
  /**
   * Translates all characters of a string into their respective case values.
   * Unlike strtolower() and strtoupper() this method is locale independent.
   * Note that the string length may change!
   * eg. lower case German "ß" (sharp S) becomes upper case "SS"
   * Unit-tested by Kasper
   * Real case folding is language dependent, this method ignores this fact.
   *
   * @param string Character set of string
   * @param string Input string to convert case for
   * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
   * @return string The converted string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   * @see strtolower(), strtoupper()
   */
  function conv_case($charset, $string, $case) {
      // mbstring does the job if configured
      if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
          return $case == 'toLower'
              ? mb_strtolower($string, $charset)
              : mb_strtoupper($string, $charset);
      }

      // otherwise the mapping tables of this class are used
      if ($charset == 'utf-8') {
          return $this->utf8_char_mapping($string, 'case', $case);
      }
      if (isset($this->eucBasedSets[$charset])) {
          return $this->euc_char_mapping($string, $charset, 'case', $case);
      }

      // treat everything else as single-byte encoding
      return $this->sb_char_mapping($string, $charset, 'case', $case);
  }
  /**
   * Equivalent of lcfirst/ucfirst but using character set:
   * only the first character of the string has its case converted.
   *
   * @param string $charset Character set of string
   * @param string $string Input string
   * @param string $case Case keyword, passed on to conv_case() ("toLower" or "toUpper")
   * @return string The string with its first character converted
   * @see t3lib_cs::conv_case()
   */
  public function convCaseFirst($charset, $string, $case) {
      $convertedHead = $this->conv_case(
          $charset,
          $this->substr($charset, $string, 0, 1),
          $case
      );
      // Everything after the first character is appended untouched
      return $convertedHead . $this->substr($charset, $string, 1);
  }
  /**
   * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
   * The "ascii" mapping of the implementation matching the charset is applied.
   *
   * @param string $charset Character set of string
   * @param string $string Input string to convert
   * @return string The converted string
   */
  function specCharsToASCII($charset, $string) {
      if ($charset == 'utf-8') {
          return $this->utf8_char_mapping($string, 'ascii');
      }
      if (isset($this->eucBasedSets[$charset])) {
          return $this->euc_char_mapping($string, $charset, 'ascii');
      }
      // treat everything else as single-byte encoding
      return $this->sb_char_mapping($string, $charset, 'ascii');
  }
  /**
   * Converts the language codes sent by the client (usually HTTP_ACCEPT_LANGUAGE)
   * into a language key known to this class.
   *
   * Each candidate is tried first as given (e.g. "pt-br") and then with everything
   * after the first "-" stripped (e.g. "pt"); candidates are tried in descending
   * order of their quality value.
   *
   * @param string $languageCodesList List of language codes, something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
   * @return string The preferred supported language key, or "default" if none was found (a match on "en" is reported as "default" as well)
   * @see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
   * @author Benjamin Mack (benni.typo3.org)
   */
  public function getPreferredClientLanguage($languageCodesList) {
      $selectedLanguage = 'default';
      $allLanguageCodes = array();

      // Start with all languages whose key is identical to the ISO code.
      foreach (array_keys($this->charSetArray) as $typo3Lang) {
          $allLanguageCodes[$typo3Lang] = $typo3Lang;
      }
      // Languages whose key differs from the ISO code (or which need the country
      // part) overwrite the default entry; "xx_YY" is normalised to "xx-YY".
      foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
          $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
      }
      // Move the ISO codes to the keys, because the lookups below use isset().
      $allLanguageCodes = array_flip($allLanguageCodes);

      // Collect "code => quality" pairs; a code without ";q=" gets quality 1.0.
      $sortedPreferredLanguages = array();
      foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
          $quality = 1.0;
          if (strpos($preferredLanguage, ';q=') !== FALSE) {
              list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
          }
          $sortedPreferredLanguages[$preferredLanguage] = $quality;
      }

      // Loop through the languages, highest quality first.
      arsort($sortedPreferredLanguages, SORT_NUMERIC);
      foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
          if (isset($allLanguageCodes[$preferredLanguage])) {
              $selectedLanguage = $allLanguageCodes[$preferredLanguage];
              break;
          }
          // Retry with the country code stripped. Only the first segment is read,
          // so codes without a "-" no longer trigger an undefined-offset notice.
          $codeParts = explode('-', $preferredLanguage);
          $languageOnly = $codeParts[0];
          if (isset($allLanguageCodes[$languageOnly])) {
              $selectedLanguage = $allLanguageCodes[$languageOnly];
              break;
          }
      }

      if (!$selectedLanguage || $selectedLanguage == 'en') {
          $selectedLanguage = 'default';
      }
      return $selectedLanguage;
  }
  1643. /********************************************
  1644. *
  1645. * Internal string operation functions
  1646. *
  1647. ********************************************/
  /**
   * Maps all characters of a string in a single byte charset.
   *
   * The string is returned unchanged when $mode is unknown or when the mapping
   * table for $charset cannot be initialised.
   *
   * @param string $str The string
   * @param string $charset The charset
   * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
   * @param string $opt For mode 'case': key of the case folding table to use ('toLower' or 'toUpper')
   * @return string The converted string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function sb_char_mapping($str, $charset, $mode, $opt = '') {
      switch ($mode) {
          case 'case':
              if (!$this->initCaseFolding($charset)) {
                  return $str; // do nothing
              }
              $map =& $this->caseFolding[$charset][$opt];
              break;
          case 'ascii':
              if (!$this->initToASCII($charset)) {
                  return $str; // do nothing
              }
              $map =& $this->toASCII[$charset];
              break;
          default:
              return $str;
      }

      $out = '';
      // Iterate by explicit byte length: the old "$str{$i}" offset syntax is a parse
      // error on PHP 8 and relied on reading past the end of the string to stop.
      $byteLength = strlen($str);
      for ($i = 0; $i < $byteLength; $i++) {
          $c = $str[$i];
          $out .= isset($map[$c]) ? $map[$c] : $c;
      }
      return $out;
  }
  1686. /********************************************
  1687. *
  1688. * Internal UTF-8 string operation functions
  1689. *
  1690. ********************************************/
  /**
   * Returns a part of a UTF-8 string, addressing it by character positions.
   * Intended to behave like substr() / mb_substr() for the full range of $start/$len.
   *
   * @param string $str UTF-8 string
   * @param integer $start Start position (character position, negative counts from the end)
   * @param integer $len Length (in characters); NULL means "up to the end of the string"
   * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
   * @see substr()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_substr($str, $start, $len = NULL) {
      // An explicit zero length (0 or '0') always yields an empty string. The cast
      // replaces strcmp($len, '0'), which is deprecated for a NULL $len on PHP 8.1.
      if ((string) $len === '0') {
          return '';
      }

      $byte_start = $this->utf8_char2byte_pos($str, $start);
      if ($byte_start === FALSE) {
          if ($start > 0) {
              return FALSE; // $start outside string length
          }
          // A non-positive $start that cannot be resolved means "from the very
          // beginning"; make that explicit instead of passing FALSE to substr().
          $byte_start = 0;
      }
      $str = substr($str, $byte_start);

      if ($len == NULL) {
          return $str;
      }

      $byte_end = $this->utf8_char2byte_pos($str, $len);
      if ($byte_end === FALSE) {
          // $len outside actual string length: a negative length that exceeds the
          // string leaves nothing, a positive one keeps everything.
          return $len < 0 ? '' : $str;
      }
      return substr($str, 0, $byte_end);
  }
  /**
   * Counts the number of characters of a string in UTF-8.
   *
   * Every byte that is not a continuation byte (10xxxxxx) starts a character,
   * i.e. single-byte characters (0xxxxxxx) and multi-byte lead bytes (11xxxxxx)
   * are counted.
   *
   * @param string $str UTF-8 multibyte character string
   * @return integer The number of characters
   * @see strlen()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_strlen($str) {
      $n = 0;
      // Explicit byte bound instead of probing "$str{$i}" past the end of the
      // string (that syntax is a parse error on PHP 8).
      $byteLength = strlen($str);
      for ($i = 0; $i < $byteLength; $i++) {
          if ((ord($str[$i]) & 0xC0) != 0x80) {
              $n++;
          }
      }
      return $n;
  }
  /**
   * Truncates a string in UTF-8 short at a given byte length without cutting
   * a multi-byte character in half.
   *
   * @param string $str UTF-8 multibyte character string
   * @param integer $len The byte length
   * @return string The shortened string (at most $len bytes for a positive $len)
   * @see mb_strcut()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_strtrunc($str, $len) {
      // Nothing to inspect when the cut lies outside the string; substr() alone
      // gives the result and no out-of-range offset is read.
      if ($len <= 0 || $len > strlen($str)) {
          return substr($str, 0, $len);
      }

      $i = $len - 1;
      if (ord($str[$i]) & 0x80) { // last kept byte is part of a multi-byte sequence
          // Walk back over continuation bytes (10xxxxxx) to the lead byte.
          while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
              $i--;
          }
          $lead = ord($str[$i]);
          if (($lead & 0xC0) == 0x80) {
              // Only continuation bytes up to the start of the string: malformed input.
              return '';
          }
          // The number of leading 1-bits of the lead byte is the sequence length.
          // A lead byte at offset 0 is handled like any other, so a character that
          // fits entirely into $len bytes is kept instead of being discarded.
          for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
              $bc++;
          }
          if ($bc + $i > $len) {
              // The character would be cut in half: drop it completely.
              return substr($str, 0, $i);
          }
          // fallthru: multibyte char fits into length
      }
      return substr($str, 0, $len);
  }
  /**
   * Find position of first occurrence of a string, both arguments are in UTF-8.
   *
   * Uses mbstring or iconv when $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
   * selects one of them, otherwise the byte position found by strpos() is
   * translated into a character position.
   *
   * @param string $haystack UTF-8 string to search in
   * @param string $needle UTF-8 string to search for
   * @param integer $offset Position to start the search (in characters)
   * @return integer|boolean The character position, or FALSE if the needle was not found or the offset lies beyond the string
   * @see strpos()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_strpos($haystack, $needle, $offset = 0) {
      // Read the setting once and tolerate it being unset (no undefined-index notice).
      $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
          ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
          : '';
      if ($utils == 'mbstring') {
          return mb_strpos($haystack, $needle, $offset, 'utf-8');
      } elseif ($utils == 'iconv') {
          return iconv_strpos($haystack, $needle, $offset, 'utf-8');
      }

      $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
      if ($byte_offset === FALSE) {
          return FALSE; // offset beyond string length
      }
      $byte_pos = strpos($haystack, $needle, $byte_offset);
      if ($byte_pos === FALSE) {
          return FALSE; // needle not found
      }
      return $this->utf8_byte2char_pos($haystack, $byte_pos);
  }
  /**
   * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
   *
   * Uses mbstring or iconv when $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
   * selects one of them, otherwise the byte position found by strrpos() is
   * translated into a character position.
   *
   * @param string $haystack UTF-8 string to search in
   * @param string $needle UTF-8 character to search for (single character)
   * @return integer|boolean The character position, or FALSE if the needle was not found
   * @see strrpos()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_strrpos($haystack, $needle) {
      // Read the setting once and tolerate it being unset (no undefined-index notice).
      $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
          ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
          : '';
      if ($utils == 'mbstring') {
          // The third parameter of mb_strrpos() is the offset; the encoding must be
          // passed as the fourth one (an encoding in the offset slot fails on PHP 8).
          return mb_strrpos($haystack, $needle, 0, 'utf-8');
      } elseif ($utils == 'iconv') {
          return iconv_strrpos($haystack, $needle, 'utf-8');
      }

      $byte_pos = strrpos($haystack, $needle);
      if ($byte_pos === FALSE) {
          return FALSE; // needle not found
      }
      return $this->utf8_byte2char_pos($haystack, $byte_pos);
  }
  /**
   * Translates a character position into an 'absolute' byte position.
   *
   * For $pos >= 0 the string is scanned forwards from byte 0, for negative $pos
   * backwards from the last byte. FALSE is returned when the scan leaves the
   * string, i.e. the position lies beyond the end or, for negative $pos, at or
   * before the first byte.
   *
   * @param string $str UTF-8 string
   * @param integer $pos Character position (negative values start from the end)
   * @return integer|boolean Byte position, or FALSE if the position is outside the string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_char2byte_pos($str, $pos) {
      $byteLength = strlen($str);
      $n = 0; // number of characters found
      $p = abs($pos); // number of characters wanted
      if ($pos >= 0) {
          $i = 0;
          $d = 1;
      } else {
          $i = $byteLength - 1;
          $d = -1;
      }

      // Explicit bounds replace the former "strlen($str{$i})" probe: since PHP 7.1 a
      // negative string offset reads from the end of the string, so the backward
      // scan would no longer notice that it ran past the first byte.
      for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
          // Count every byte that starts a character (anything but 10xxxxxx).
          if ((ord($str[$i]) & 0xC0) != 0x80) {
              $n++;
          }
      }
      if ($i < 0 || $i >= $byteLength) {
          return FALSE; // offset beyond string length
      }

      if ($pos >= 0) {
          // skip trailing multi-byte data bytes
          while ($i < $byteLength && (ord($str[$i]) & 0xC0) == 0x80) {
              $i++;
          }
      } else {
          // correct offset: the loop stopped one byte before the wanted character
          $i++;
      }
      return $i;
  }
  /**
   * Translates an 'absolute' byte position into a character position.
   *
   * The result is the number of character start bytes found in the byte range
   * 1..$pos, which is the character index when $pos points at the first byte
   * of a character.
   *
   * @param string $str UTF-8 string
   * @param integer $pos Byte position
   * @return integer|boolean Character position, or FALSE if $pos lies outside the string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_byte2char_pos($str, $pos) {
      // Reject positions outside the string up front; previously bytes beyond the
      // end were read as empty and wrongly counted as characters.
      if ($pos < 0 || $pos >= strlen($str)) {
          return FALSE; // offset beyond string length
      }

      $n = 0; // number of characters
      for ($i = $pos; $i > 0; $i--) {
          // Count every byte that starts a character (anything but 10xxxxxx).
          if ((ord($str[$i]) & 0xC0) != 0x80) {
              $n++;
          }
      }
      return $n;
  }
  /**
   * Maps all characters of an UTF-8 string.
   *
   * The string is returned unchanged when $mode is unknown or when the Unicode
   * data for $mode cannot be initialised.
   *
   * @param string $str UTF-8 string
   * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
   * @param string $opt For mode 'case': key of the case folding table to use ('toLower' or 'toUpper')
   * @return string The converted string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function utf8_char_mapping($str, $mode, $opt = '') {
      if (!$this->initUnicodeData($mode)) {
          return $str; // do nothing
      }

      switch ($mode) {
          case 'case':
              $map =& $this->caseFolding['utf-8'][$opt];
              break;
          case 'ascii':
              $map =& $this->toASCII['utf-8'];
              break;
          default:
              return $str;
      }

      $out = '';
      // Explicit byte bound instead of probing "$str{$i}" past the end of the
      // string (that syntax is a parse error on PHP 8).
      $byteLength = strlen($str);
      for ($i = 0; $i < $byteLength; $i++) {
          $c = ord($str[$i]);
          // Default: the byte stands for itself. This covers single-byte characters
          // (0xxxxxxx) and also stray continuation bytes (10xxxxxx), which formerly
          // re-emitted the previously processed character.
          $mbc = $str[$i];
          if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
              // The number of leading 1-bits is the length of the sequence in bytes.
              for ($bc = 0; $c & 0x80; $c = $c << 1) {
                  $bc++;
              }
              $mbc = substr($str, $i, $bc);
              $i += $bc - 1;
          }
          $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
      }
      return $out;
  }
  1947. /********************************************
  1948. *
  1949. * Internal EUC string operation functions
  1950. *
  1951. * Extended Unix Code:
  1952. * ASCII compatible 7bit single bytes chars
  1953. * 8bit two byte chars
  1954. *
  1955. * Shift-JIS is treated as a special case.
  1956. *
  1957. ********************************************/
  /**
   * Cuts a string in the EUC charset family short at a given byte length
   * without splitting a double-byte character.
   *
   * @param string $str EUC multibyte character string
   * @param integer $len The byte length
   * @param string $charset The charset ('shift_jis' uses its own lead byte ranges)
   * @return string The shortened string
   * @see mb_strcut()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function euc_strtrunc($str, $len, $charset) {
      // String not longer than the supplied length: nothing to cut. Checking the
      // byte length directly also covers a cut inside the final double-byte
      // character, which used to return the whole (too long) string.
      if (strlen($str) <= $len) {
          return $str;
      }

      $sjis = ($charset == 'shift_jis');
      // $i < $len < strlen($str) holds inside the loop, so $str[$i] is always valid.
      for ($i = 0; $i < $len; $i++) {
          $c = ord($str[$i]);
          if ($sjis) {
              if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
                  $i++; // advance a double-byte char
              }
          } else {
              if ($c >= 0x80) {
                  $i++; // advance a double-byte char
              }
          }
      }

      if ($i > $len) {
          return substr($str, 0, $len - 1); // we ended on a first byte
      }
      return substr($str, 0, $len);
  }
  /**
   * Returns a part of a string in the EUC charset family, addressing it by
   * character positions. Handles zero and negative $start/$len the same way
   * as utf8_substr().
   *
   * @param string $str EUC multibyte character string
   * @param integer $start Start position (character position, negative counts from the end)
   * @param string $charset The charset
   * @param integer $len Length (in characters); NULL means "up to the end of the string"
   * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function euc_substr($str, $start, $charset, $len = NULL) {
      // An explicit zero length (0 or '0') yields an empty string; the loose
      // "$len != NULL" test alone would treat it as "no length given".
      if ((string) $len === '0') {
          return '';
      }

      $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
      if ($byte_start === FALSE) {
          if ($start > 0) {
              return FALSE; // $start outside string length
          }
          // A non-positive $start that cannot be resolved means "from the very beginning".
          $byte_start = 0;
      }
      $str = substr($str, $byte_start);

      if ($len == NULL) {
          return $str;
      }

      $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
      if ($byte_end === FALSE) {
          // $len outside actual string length: a negative length that exceeds the
          // string leaves nothing, a positive one keeps everything.
          return $len < 0 ? '' : $str;
      }
      return substr($str, 0, $byte_end);
  }
  /**
   * Counts the number of characters of a string in the EUC charset family.
   *
   * A byte >= 0x80 is taken as the first byte of a double-byte character; for
   * 'shift_jis' only the ranges 0x80-0x9F and 0xE0-0xFF are.
   *
   * @param string $str EUC multibyte character string
   * @param string $charset The charset
   * @return integer The number of characters
   * @see strlen()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function euc_strlen($str, $charset) {
      $sjis = ($charset == 'shift_jis');
      $n = 0;
      // Explicit byte bound instead of probing "$str{$i}" past the end of the
      // string (that syntax is a parse error on PHP 8).
      $byteLength = strlen($str);
      for ($i = 0; $i < $byteLength; $i++) {
          $c = ord($str[$i]);
          if ($sjis) {
              if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
                  $i++; // advance a double-byte char
              }
          } else {
              if ($c >= 0x80) {
                  $i++; // advance a double-byte char
              }
          }
          $n++;
      }
      return $n;
  }
  /**
   * Translates a character position into an 'absolute' byte position.
   *
   * For $pos >= 0 the string is scanned forwards from byte 0, for negative $pos
   * backwards from the last byte. FALSE is returned when the scan leaves the
   * string, i.e. the position lies beyond the end or, for negative $pos, at or
   * before the first byte.
   *
   * @param string $str EUC multibyte character string
   * @param integer $pos Character position (negative values start from the end)
   * @param string $charset The charset ('shift_jis' uses its own lead byte ranges)
   * @return integer|boolean Byte position, or FALSE if the position is outside the string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function euc_char2byte_pos($str, $pos, $charset) {
      $sjis = ($charset == 'shift_jis');
      $byteLength = strlen($str);
      $n = 0; // number of characters seen
      $p = abs($pos); // number of characters wanted
      if ($pos >= 0) {
          $i = 0;
          $d = 1;
      } else {
          $i = $byteLength - 1;
          $d = -1;
      }

      // Explicit bounds replace the former "strlen($str{$i})" probe: since PHP 7.1 a
      // negative string offset reads from the end of the string, so the backward
      // scan would no longer notice that it ran past the first byte.
      for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
          $c = ord($str[$i]);
          if ($sjis) {
              if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
                  $i += $d; // advance a double-byte char
              }
          } else {
              if ($c >= 0x80) {
                  $i += $d; // advance a double-byte char
              }
          }
          $n++;
      }
      if ($i < 0 || $i >= $byteLength) {
          return FALSE; // offset beyond string length
      }

      if ($pos < 0) {
          $i++; // correct offset: the loop stopped one byte before the wanted character
      }
      return $i;
  }
  /**
   * Maps all characters of a string in the EUC charset family.
   *
   * The string is returned unchanged when $mode is unknown or when the mapping
   * table for $charset cannot be initialised.
   *
   * @param string $str EUC multibyte character string
   * @param string $charset The charset ('shift_jis' uses its own lead byte ranges)
   * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
   * @param string $opt For mode 'case': key of the case folding table to use ('toLower' or 'toUpper')
   * @return string The converted string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function euc_char_mapping($str, $charset, $mode, $opt = '') {
      switch ($mode) {
          case 'case':
              if (!$this->initCaseFolding($charset)) {
                  return $str; // do nothing
              }
              $map =& $this->caseFolding[$charset][$opt];
              break;
          case 'ascii':
              if (!$this->initToASCII($charset)) {
                  return $str; // do nothing
              }
              $map =& $this->toASCII[$charset];
              break;
          default:
              return $str;
      }

      $sjis = ($charset == 'shift_jis');
      $out = '';
      // Explicit byte bound instead of probing "$str{$i}" past the end of the
      // string (that syntax is a parse error on PHP 8).
      $byteLength = strlen($str);
      for ($i = 0; $i < $byteLength; $i++) {
          $mbc = $str[$i];
          $c = ord($mbc);
          if ($sjis) {
              if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
                  $mbc = substr($str, $i, 2);
                  $i++;
              }
          } else {
              if ($c >= 0x80) { // a double-byte char
                  $mbc = substr($str, $i, 2);
                  $i++;
              }
          }
          $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
      }
      return $out;
  }
  2145. }
  // XCLASS hook: when running inside TYPO3 and an extension class file is registered
  // for this script under the current TYPO3_MODE, load it (once).
  if (defined('TYPO3_MODE')) {
      if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
          include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
      }
  }
  2149. ?>