PageRenderTime 79ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/typo3/class.t3lib_cs.php

https://bitbucket.org/kudutest1/moodlegit
PHP | 1529 lines | 1075 code | 131 blank | 323 comment | 226 complexity | f8151d81360df9439f24956927dc4251 MD5 | raw file
  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the Typo3 project. The Typo3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. *
  17. * This script is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * This copyright notice MUST APPEAR in all copies of the script!
  23. ***************************************************************/
  24. /**
  25. * Class for conversion between charsets.
  26. *
  27. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  28. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  29. */
  30. /**
  31. * Notes on UTF-8
  32. *
  33. * Functions working on UTF-8 strings:
  34. *
  35. * - strchr/strstr
  36. * - strrchr
  37. * - substr_count
  38. * - implode/explode/join
  39. *
  40. * Functions nearly working on UTF-8 strings:
  41. *
  42. * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  43. * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  44. * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  45. * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
  46. * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier
  47. *
  48. * Functions NOT working on UTF-8 strings:
  49. *
  50. * - str*cmp
  51. * - stristr
  52. * - stripos
  53. * - substr
  54. * - strrev
  55. * - split/spliti
  56. * - ...
  57. *
  58. */
  59. /**
  60. * Class for conversion between charsets
  61. *
  62. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  63. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  64. * @package TYPO3
  65. * @subpackage t3lib
  66. */
  67. class t3lib_cs {
  /**
   * Locales helper, assigned in the constructor.
   *
   * @var t3lib_l10n_Locales
   */
  protected $locales;

  /**
   * Byte value emitted for characters that have no equivalent (63 = "?").
   *
   * @var integer
   */
  public $noCharByteVal = 63;

  /**
   * Cache of parsed conversion tables.
   *
   * @var array
   */
  public $parsedCharsets = array();

  /**
   * Cache of case folding data.
   *
   * @var array
   */
  public $caseFolding = array();

  /**
   * Cache of charset-to-ASCII mappings.
   *
   * @var array
   */
  public $toASCII = array();
  /**
   * Charsets that use two bytes per character.
   *
   * @var array
   */
  public $twoByteSets = array(
      'ucs-2' => 1, // 2-byte Unicode
  );

  /**
   * Charsets that use four bytes per character.
   *
   * @var array
   */
  public $fourByteSets = array(
      'ucs-4' => 1, // 4-byte Unicode
      'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  );

  /**
   * Charsets that use a scheme like the Extended Unix Code.
   *
   * @var array
   */
  public $eucBasedSets = array(
      'gb2312' => 1, // Chinese, simplified.
      'big5' => 1, // Chinese, traditional.
      'euc-kr' => 1, // Korean
      'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-byte characters above 0x80!
  );
  /**
   * Alternative charset names mapped to the name used inside this class.
   * parse_charset() looks its lower-cased, trimmed input up in this table.
   *
   * see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
   * http://czyborra.com/charsets/iso8859.html
   *
   * @var array
   */
  public $synonyms = array(
      'us' => 'ascii',
      'us-ascii' => 'ascii',
      'cp819' => 'iso-8859-1',
      'ibm819' => 'iso-8859-1',
      'iso-ir-100' => 'iso-8859-1',
      'iso-ir-101' => 'iso-8859-2',
      'iso-ir-109' => 'iso-8859-3',
      'iso-ir-110' => 'iso-8859-4',
      'iso-ir-144' => 'iso-8859-5',
      'iso-ir-127' => 'iso-8859-6',
      'iso-ir-126' => 'iso-8859-7',
      'iso-ir-138' => 'iso-8859-8',
      'iso-ir-148' => 'iso-8859-9',
      'iso-ir-157' => 'iso-8859-10',
      'iso-ir-179' => 'iso-8859-13',
      'iso-ir-199' => 'iso-8859-14',
      'iso-ir-203' => 'iso-8859-15',
      'csisolatin1' => 'iso-8859-1',
      'csisolatin2' => 'iso-8859-2',
      'csisolatin3' => 'iso-8859-3',
      'csisolatin5' => 'iso-8859-9',
      'csisolatin8' => 'iso-8859-14',
      'csisolatin9' => 'iso-8859-15',
      'csisolatingreek' => 'iso-8859-7',
      'iso-celtic' => 'iso-8859-14',
      'latin1' => 'iso-8859-1',
      'latin2' => 'iso-8859-2',
      'latin3' => 'iso-8859-3',
      'latin5' => 'iso-8859-9',
      'latin6' => 'iso-8859-10',
      'latin8' => 'iso-8859-14',
      'latin9' => 'iso-8859-15',
      'l1' => 'iso-8859-1',
      'l2' => 'iso-8859-2',
      'l3' => 'iso-8859-3',
      'l5' => 'iso-8859-9',
      'l6' => 'iso-8859-10',
      'l8' => 'iso-8859-14',
      'l9' => 'iso-8859-15',
      'cyrillic' => 'iso-8859-5',
      'arabic' => 'iso-8859-6',
      'tis-620' => 'iso-8859-11',
      'win874' => 'windows-874',
      'win1250' => 'windows-1250',
      'win1251' => 'windows-1251',
      'win1252' => 'windows-1252',
      'win1253' => 'windows-1253',
      'win1254' => 'windows-1254',
      'win1255' => 'windows-1255',
      'win1256' => 'windows-1256',
      'win1257' => 'windows-1257',
      'win1258' => 'windows-1258',
      'cp1250' => 'windows-1250',
      'cp1251' => 'windows-1251',
      'cp1252' => 'windows-1252',
      'ms-ee' => 'windows-1250',
      'ms-ansi' => 'windows-1252',
      'ms-greek' => 'windows-1253',
      'ms-turk' => 'windows-1254',
      'winbaltrim' => 'windows-1257',
      'koi-8ru' => 'koi-8r',
      'koi8r' => 'koi-8r',
      'cp878' => 'koi-8r',
      'mac' => 'macroman',
      'macintosh' => 'macroman',
      'euc-cn' => 'gb2312',
      'x-euc-cn' => 'gb2312',
      'euccn' => 'gb2312',
      'cp936' => 'gb2312',
      'big-5' => 'big5',
      'cp950' => 'big5',
      'eucjp' => 'euc-jp',
      'sjis' => 'shift_jis',
      'shift-jis' => 'shift_jis',
      'cp932' => 'shift_jis',
      'cp949' => 'euc-kr',
      'utf7' => 'utf-7',
      // 'utf8' used to be listed twice in this table; the redundant second entry was dropped.
      'utf8' => 'utf-8',
      'utf16' => 'utf-16',
      'utf32' => 'utf-32',
      'ucs2' => 'ucs-2',
      'ucs4' => 'ucs-4',
  );
  /**
   * Mapping of language identifiers to script (language family) names.
   * get_locale_charset() resolves the script through the script_to_charset_* tables.
   *
   * Fixes compared to the historic table:
   * - 'isl' pointed to the misspelled script 'west_euorpean', which exists in no charset table.
   * - 'swedish' and 'thai' were only present as the misspellings 'svedish' and 'that';
   *   the misspelled keys are kept so existing callers keep resolving.
   * - 'bulgarian' now maps to 'cyrillic', consistent with 'bg' and 'bgr'.
   *
   * @var array
   */
  public $lang_to_script = array(
      // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
      'af' => 'west_european', // Afrikaans
      'ar' => 'arabic',
      'bg' => 'cyrillic', // Bulgarian
      'bs' => 'east_european', // Bosnian
      'cs' => 'east_european', // Czech
      'da' => 'west_european', // Danish
      'de' => 'west_european', // German
      'es' => 'west_european', // Spanish
      'et' => 'estonian',
      'eo' => 'unicode', // Esperanto
      'eu' => 'west_european', // Basque
      'fa' => 'arabic', // Persian
      'fi' => 'west_european', // Finnish
      'fo' => 'west_european', // Faroese
      'fr' => 'west_european', // French
      'ga' => 'west_european', // Irish
      'gl' => 'west_european', // Galician
      'gr' => 'greek',
      'he' => 'hebrew', // Hebrew (since 1998)
      'hi' => 'unicode', // Hindi
      'hr' => 'east_european', // Croatian
      'hu' => 'east_european', // Hungarian
      'iw' => 'hebrew', // Hebrew (until 1998)
      'is' => 'west_european', // Icelandic
      'it' => 'west_european', // Italian
      'ja' => 'japanese',
      'ka' => 'unicode', // Georgian
      'kl' => 'west_european', // Greenlandic
      'km' => 'unicode', // Khmer
      'ko' => 'korean',
      'lt' => 'lithuanian',
      'lv' => 'west_european', // Latvian/Lettish
      'nl' => 'west_european', // Dutch
      'no' => 'west_european', // Norwegian
      'nb' => 'west_european', // Norwegian Bokmal
      'nn' => 'west_european', // Norwegian Nynorsk
      'pl' => 'east_european', // Polish
      'pt' => 'west_european', // Portuguese
      'ro' => 'east_european', // Romanian
      'ru' => 'cyrillic', // Russian
      'sk' => 'east_european', // Slovak
      'sl' => 'east_european', // Slovenian
      'sr' => 'cyrillic', // Serbian
      'sv' => 'west_european', // Swedish
      'sq' => 'albanian', // Albanian
      'th' => 'thai',
      'uk' => 'cyrillic', // Ukrainian
      'vi' => 'vietnamese',
      'zh' => 'chinese',
      // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
      // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
      'afk' => 'west_european', // Afrikaans
      'ara' => 'arabic',
      'bgr' => 'cyrillic', // Bulgarian
      'cat' => 'west_european', // Catalan
      'chs' => 'simpl_chinese',
      'cht' => 'trad_chinese',
      'csy' => 'east_european', // Czech
      'dan' => 'west_european', // Danish
      'deu' => 'west_european', // German
      'dea' => 'west_european', // German (Austrian)
      'des' => 'west_european', // German (Swiss)
      'ena' => 'west_european', // English (Australian)
      'enc' => 'west_european', // English (Canadian)
      'eng' => 'west_european', // English
      'enz' => 'west_european', // English (New Zealand)
      'enu' => 'west_european', // English (United States)
      'euq' => 'west_european', // Basque
      'fos' => 'west_european', // Faroese
      'far' => 'arabic', // Persian
      'fin' => 'west_european', // Finnish
      'fra' => 'west_european', // French
      'frb' => 'west_european', // French (Belgian)
      'frc' => 'west_european', // French (Canadian)
      'frs' => 'west_european', // French (Swiss)
      'geo' => 'unicode', // Georgian
      'glg' => 'west_european', // Galician
      'ell' => 'greek',
      'heb' => 'hebrew',
      'hin' => 'unicode', // Hindi
      'hun' => 'east_european', // Hungarian
      'isl' => 'west_european', // Icelandic
      'ita' => 'west_european', // Italian
      'its' => 'west_european', // Italian (Swiss)
      'jpn' => 'japanese',
      'khm' => 'unicode', // Khmer
      'kor' => 'korean',
      'lth' => 'lithuanian',
      'lvi' => 'west_european', // Latvian/Lettish
      'msl' => 'west_european', // Malay
      'nlb' => 'west_european', // Dutch (Belgian)
      'nld' => 'west_european', // Dutch
      'nor' => 'west_european', // Norwegian (bokmal)
      'non' => 'west_european', // Norwegian (nynorsk)
      'plk' => 'east_european', // Polish
      'ptg' => 'west_european', // Portuguese
      'ptb' => 'west_european', // Portuguese (Brazil)
      'rom' => 'east_european', // Romanian
      'rus' => 'cyrillic', // Russian
      'slv' => 'east_european', // Slovenian
      'sky' => 'east_european', // Slovak
      'srl' => 'east_european', // Serbian (Latin)
      'srb' => 'cyrillic', // Serbian (Cyrillic)
      'esp' => 'west_european', // Spanish (trad. sort)
      'esm' => 'west_european', // Spanish (Mexican)
      'esn' => 'west_european', // Spanish (internat. sort)
      'sve' => 'west_european', // Swedish
      'sqi' => 'albanian', // Albanian
      'tha' => 'thai',
      'trk' => 'turkish',
      'ukr' => 'cyrillic', // Ukrainian
      // English language names
      'afrikaans' => 'west_european',
      'albanian' => 'albanian',
      'arabic' => 'arabic',
      'basque' => 'west_european',
      'bosnian' => 'east_european',
      'bulgarian' => 'cyrillic',
      'catalan' => 'west_european',
      'croatian' => 'east_european',
      'czech' => 'east_european',
      'danish' => 'west_european',
      'dutch' => 'west_european',
      'english' => 'west_european',
      'esperanto' => 'unicode',
      'estonian' => 'estonian',
      'faroese' => 'west_european',
      'farsi' => 'arabic',
      'finnish' => 'west_european',
      'french' => 'west_european',
      'galician' => 'west_european',
      'georgian' => 'unicode',
      'german' => 'west_european',
      'greek' => 'greek',
      'greenlandic' => 'west_european',
      'hebrew' => 'hebrew',
      'hindi' => 'unicode',
      'hungarian' => 'east_european',
      'icelandic' => 'west_european',
      'italian' => 'west_european',
      'khmer' => 'unicode',
      'latvian' => 'west_european',
      'lettish' => 'west_european',
      'lithuanian' => 'lithuanian',
      'malay' => 'west_european',
      'norwegian' => 'west_european',
      'persian' => 'arabic',
      'polish' => 'east_european',
      'portuguese' => 'west_european',
      'russian' => 'cyrillic',
      'romanian' => 'east_european',
      'serbian' => 'cyrillic',
      'slovak' => 'east_european',
      'slovenian' => 'east_european',
      'spanish' => 'west_european',
      'swedish' => 'west_european',
      'svedish' => 'west_european', // legacy misspelling of "swedish", kept for backward compatibility
      'thai' => 'thai',
      'that' => 'thai', // legacy misspelling of "thai", kept for backward compatibility
      'turkish' => 'turkish',
      'ukrainian' => 'cyrillic',
  );
  /**
   * Script (language family) name => charset used on Unix systems.
   * An empty value makes get_locale_charset() fall back to its Unix default.
   *
   * @var array
   */
  public $script_to_charset_unix = array(
      'west_european' => 'iso-8859-1',
      'estonian' => 'iso-8859-1',
      'east_european' => 'iso-8859-2',
      'baltic' => 'iso-8859-4',
      'cyrillic' => 'iso-8859-5',
      'arabic' => 'iso-8859-6',
      'greek' => 'iso-8859-7',
      'hebrew' => 'iso-8859-8',
      'turkish' => 'iso-8859-9',
      'thai' => 'iso-8859-11', // same as TIS-620
      'lithuanian' => 'iso-8859-13',
      'chinese' => 'gb2312', // same as euc-cn
      'japanese' => 'euc-jp',
      'korean' => 'euc-kr',
      'simpl_chinese' => 'gb2312',
      'trad_chinese' => 'big5',
      'vietnamese' => '',
      'unicode' => 'utf-8',
      'albanian' => 'utf-8'
  );
  /**
   * Script (language family) name => charset used on Windows systems.
   *
   * @var array
   */
  public $script_to_charset_windows = array(
      'east_european' => 'windows-1250',
      'cyrillic' => 'windows-1251',
      'west_european' => 'windows-1252',
      'greek' => 'windows-1253',
      'turkish' => 'windows-1254',
      'hebrew' => 'windows-1255',
      'arabic' => 'windows-1256',
      'baltic' => 'windows-1257',
      'estonian' => 'windows-1257',
      'lithuanian' => 'windows-1257',
      'vietnamese' => 'windows-1258',
      'thai' => 'cp874',
      'korean' => 'cp949',
      'chinese' => 'gb2312',
      'japanese' => 'shift_jis',
      'simpl_chinese' => 'gb2312',
      'trad_chinese' => 'big5',
      'albanian' => 'windows-1250',
      'unicode' => 'utf-8'
  );
  /**
   * Mapping of complete locale names to charsets.
   *
   * get_locale_charset() lower-cases the locale before looking it up here, so
   * keys must be lower case to be reachable. The historic mixed-case key
   * 'sr@Latn' could never match; a lower-case twin was added and the original
   * key is kept for any code that reads this table directly.
   *
   * @var array
   */
  public $locale_to_charset = array(
      'japanese.euc' => 'euc-jp',
      'ja_jp.ujis' => 'euc-jp',
      'korean.euc' => 'euc-kr',
      'sr@latn' => 'iso-8859-2',
      'sr@Latn' => 'iso-8859-2', // legacy spelling, unreachable after lower-casing
      'zh_cn' => 'gb2312',
      'zh_hk' => 'big5',
      'zh_tw' => 'big5',
  );
  /**
   * TYPO3 specific: system charset used for each TYPO3 system language key.
   * An empty value means "iso-8859-1".
   *
   * @var array
   */
  public $charSetArray = array(
      'af' => '',
      'ar' => 'iso-8859-6',
      'ba' => 'iso-8859-2',
      'bg' => 'windows-1251',
      'br' => '',
      'ca' => 'iso-8859-15',
      'ch' => 'gb2312',
      'cs' => 'windows-1250',
      'cz' => 'windows-1250',
      'da' => '',
      'de' => '',
      'dk' => '',
      'el' => 'iso-8859-7',
      'eo' => 'utf-8',
      'es' => '',
      'et' => 'iso-8859-4',
      'eu' => '',
      'fa' => 'utf-8',
      'fi' => '',
      'fo' => 'utf-8',
      'fr' => '',
      'fr_CA' => '',
      'ga' => '',
      'ge' => 'utf-8',
      'gl' => '',
      'gr' => 'iso-8859-7',
      'he' => 'utf-8',
      'hi' => 'utf-8',
      'hk' => 'big5',
      'hr' => 'windows-1250',
      'hu' => 'iso-8859-2',
      'is' => 'utf-8',
      'it' => '',
      'ja' => 'shift_jis',
      'jp' => 'shift_jis',
      'ka' => 'utf-8',
      'kl' => 'utf-8',
      'km' => 'utf-8',
      'ko' => 'euc-kr',
      'kr' => 'euc-kr',
      'lt' => 'windows-1257',
      'lv' => 'utf-8',
      'ms' => '',
      'my' => '',
      'nl' => '',
      'no' => '',
      'pl' => 'iso-8859-2',
      'pt' => '',
      'pt_BR' => '',
      'qc' => '',
      'ro' => 'iso-8859-2',
      'ru' => 'windows-1251',
      'se' => '',
      'si' => 'windows-1250',
      'sk' => 'windows-1250',
      'sl' => 'windows-1250',
      'sq' => 'utf-8',
      'sr' => 'utf-8',
      'sv' => '',
      'th' => 'iso-8859-11',
      'tr' => 'iso-8859-9',
      'ua' => 'windows-1251',
      'uk' => 'windows-1251',
      'vi' => 'utf-8',
      'vn' => 'utf-8',
      'zh' => 'big5',
  );
  /**
   * TYPO3 specific: ISO name for each TYPO3 system language key.
   * A missing key means the ISO name equals the TYPO3 key.
   *
   * @var array
   * @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
   */
  public $isoArray = array(
      'ba' => 'bs',
      'br' => 'pt_BR',
      'ch' => 'zh_CN',
      'cz' => 'cs',
      'dk' => 'da',
      'si' => 'sl',
      'se' => 'sv',
      'gl' => 'kl',
      'gr' => 'el',
      'hk' => 'zh_HK',
      'kr' => 'ko',
      'ua' => 'uk',
      'jp' => 'ja',
      'qc' => 'fr_CA',
      'vn' => 'vi',
      'ge' => 'ka',
      'ga' => 'gl',
  );
  /**
   * Creates the converter and obtains the locales helper through
   * t3lib_div::makeInstance().
   */
  public function __construct() {
      $localesHelper = t3lib_div::makeInstance('t3lib_l10n_Locales');
      $this->locales = $localesHelper;
  }
  /**
   * Normalizes a charset name: lower-cases and trims it, then replaces it with
   * its canonical name when it is listed in $this->synonyms.
   *
   * @param string $charset Input charset
   * @return string Normalized charset
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function parse_charset($charset) {
      $normalized = trim(strtolower($charset));
      return isset($this->synonyms[$normalized])
          ? $this->synonyms[$normalized]
          : $normalized;
  }
  /**
   * Get the charset of a locale.
   *
   * Accepted locale formats:
   *   ln             language
   *   ln_CN          language / country
   *   ln_CN.cs       language / country / charset
   *   ln_CN.cs@mod   language / country / charset / modifier
   *
   * Resolution order: exact match in $this->locale_to_charset, explicit charset
   * part, the "euro" modifier, then the language's script looked up in the
   * OS-specific script table. Unknown languages yield 'windows-1252' on
   * Windows and 'utf-8' elsewhere.
   *
   * @param string $locale Locale string
   * @return string Charset resolved for locale string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function get_locale_charset($locale) {
      $locale = strtolower($locale);

      // Exact locale-specific charset? The input was just lower-cased, so the
      // table keys are folded as well; otherwise mixed-case keys never match.
      $exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
      if (isset($exactMatches[$locale])) {
          return $exactMatches[$locale];
      }

      // Split off the modifier. array_pad() guarantees both list() slots exist
      // even when the locale contains no "@" (avoids undefined-offset notices).
      list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

      // Locale contains a charset: use it.
      list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
      if ($charset) {
          return $this->parse_charset($charset);
      }

      // Modifier is 'euro' (checked after the charset because of xx.utf-8@euro).
      if ($modifier == 'euro') {
          return 'iso-8859-15';
      }

      // The language is everything before the first "_"; the country is not needed.
      $localeParts = explode('_', $locale);
      $language = $localeParts[0];

      // An empty script makes the lookups below fall through to the OS default.
      $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

      if (TYPO3_OS == 'WIN') {
          $cs = !empty($this->script_to_charset_windows[$script])
              ? $this->script_to_charset_windows[$script]
              : 'windows-1252';
      } else {
          $cs = !empty($this->script_to_charset_unix[$script])
              ? $this->script_to_charset_unix[$script]
              : 'utf-8';
      }
      return $cs;
  }
  552. /********************************************
  553. *
  554. * Charset Conversion functions
  555. *
  556. ********************************************/
  /**
   * Convert from one charset to another charset.
   *
   * When the target is UTF-8 or no entity fallback is requested, the method
   * configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
   * ('mbstring', 'iconv' or 'recode') is tried first. If that setting is
   * missing, the extension is not loaded, or the library call fails, the
   * conversion falls back to the table-based utf8_encode()/utf8_decode() pair.
   *
   * @param string $str Input string
   * @param string $fromCS From charset (the current charset of the string)
   * @param string $toCS To charset (the output charset wanted)
   * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Converted string
   * @see convArray()
   */
  function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
      if ($fromCS == $toCS) {
          return $str;
      }
      // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
      if ($toCS == 'utf-8' || !$useEntityForNoChar) {
          // Read the setting defensively: it may not be configured at all.
          $convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
              ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
              : '';
          $conv_str = FALSE;
          switch ($convMethod) {
              case 'mbstring':
                  if (function_exists('mb_convert_encoding')) {
                      try {
                          // Returns FALSE for unsupported charsets on older PHP versions ...
                          $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
                      } catch (ValueError $e) {
                          // ... and throws on PHP 8; treat both as "not converted".
                          $conv_str = FALSE;
                      }
                  }
                  break;
              case 'iconv':
                  if (function_exists('iconv')) {
                      $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
                  }
                  break;
              case 'recode':
                  if (function_exists('recode_string')) {
                      $conv_str = recode_string($fromCS . '..' . $toCS, $str);
                  }
                  break;
          }
          if (FALSE !== $conv_str) {
              return $conv_str;
          }
          // fallback to TYPO3 conversion
      }
      if ($fromCS != 'utf-8') {
          $str = $this->utf8_encode($str, $fromCS);
      }
      if ($toCS != 'utf-8') {
          $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
      }
      return $str;
  }
  /**
   * Converts every string element of an array from one charset to another,
   * descending into nested arrays. Non-string, non-array values are left alone.
   * NOTICE: The array is passed by reference and modified in place!
   *
   * @param array $array Input array, possibly multidimensional
   * @param string $fromCS From charset (the current charset of the strings)
   * @param string $toCS To charset (the output charset wanted)
   * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return void
   * @see conv()
   */
  function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
      foreach ($array as $key => $unused) {
          // Always work on $array[$key] so the change lands in the caller's array.
          if (is_string($array[$key])) {
              $array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
              continue;
          }
          if (is_array($array[$key])) {
              $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
          }
      }
  }
  /**
   * Converts $str from $charset to UTF-8
   *
   * Bytes are mapped through $this->parsedCharsets[$charset]['local']. Units
   * without a table entry are replaced by chr($this->noCharByteVal).
   * NOTE: when initCharset() reports failure nothing is returned (NULL).
   *
   * @param string $str String in local charset to convert to UTF-8
   * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
   * @return string Output string, converted to UTF-8
   */
  function utf8_encode($str, $charset) {
      if ($charset === 'utf-8') {
          return $str;
      }
      // Charset is case-insensitive.
      if ($this->initCharset($charset)) { // Parse conv. table if not already...
          $strLen = strlen($str);
          $outStr = '';
          for ($a = 0; $a < $strLen; $a++) { // Traverse each byte in string.
              $chr = substr($str, $a, 1);
              $ord = ord($chr);
              if (isset($this->twoByteSets[$charset])) { // The charset has two bytes per char
                  // substr() replaces the $str{...} offset syntax, which no longer parses on PHP 8.
                  // Past the end of an odd-length string it yields an empty value, i.e. a low byte of 0,
                  // without raising an "uninitialized string offset" notice.
                  $ord2 = ord((string) substr($str, $a + 1, 1));
                  $ord = ($ord << 8) | $ord2; // assume big endian
                  if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char number found in the parsed conv. table
                      $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                  } else {
                      $outStr .= chr($this->noCharByteVal); // No char exists
                  }
                  $a++;
              } elseif ($ord > 127) { // A byte above 127 needs a table lookup
                  if (isset($this->eucBasedSets[$charset])) { // EUC uses two bytes above 127; fetch both, advance the pointer and make $ord a 16bit int.
                      if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
                          $a++;
                          $ord2 = ord(substr($str, $a, 1));
                          $ord = $ord * 256 + $ord2;
                      }
                  }
                  if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char number found in the parsed conv. table
                      $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                  } else {
                      $outStr .= chr($this->noCharByteVal); // No char exists
                  }
              } else {
                  $outStr .= $chr; // Plain ASCII 0-127, one byte. Transparent
              }
          }
          return $outStr;
      }
  }
  /**
   * Converts $str from UTF-8 to $charset
   *
   * Each UTF-8 sequence is looked up in $this->parsedCharsets[$charset]['utf8'].
   * Sequences without an entry become a numeric entity (when requested) or
   * chr($this->noCharByteVal). Nothing is returned when initCharset() fails.
   *
   * @param string $str String in UTF-8 to convert to local charset
   * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
   * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Output string, converted to local charset
   */
  function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
      if ($charset === 'utf-8') {
          return $str;
      }
      // Make sure the conversion table is parsed; give up (no value) otherwise.
      if (!$this->initCharset($charset)) {
          return;
      }

      $length = strlen($str);
      $result = '';
      for ($pos = 0; $pos < $length; $pos++) {
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // ASCII 0-127: one byte, passed through unchanged.
          if ($code <= 127) {
              $result .= $byte;
              continue;
          }
          // A lead byte must have bit 7 (value 64) set; otherwise we are in the
          // middle of a multibyte sequence and no character exists.
          if (!($code & 64)) {
              $result .= chr($this->noCharByteVal);
              continue;
          }

          // Collect the sequence: every further high bit of the lead byte
          // announces one more byte.
          $sequence = $byte;
          for ($n = 0; $n < 8; $n++) {
              $code = $code << 1;
              if (!($code & 128)) {
                  break;
              }
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }

          if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
              $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
              // Numbers above 255 are written as two bytes (16 bit word assumed).
              $result .= ($localNumber > 255)
                  ? chr(($localNumber >> 8) & 255) . chr($localNumber & 255)
                  : chr($localNumber);
          } elseif ($useEntityForNoChar) {
              $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
          } else {
              $result .= chr($this->noCharByteVal);
          }
      }
      return $result;
  }
  /**
   * Converts all chars > 127 to numeric entities.
   *
   * A byte above 127 that cannot start a sequence is replaced by
   * chr($this->noCharByteVal).
   *
   * @param string $str Input string
   * @return string Output string
   */
  function utf8_to_entities($str) {
      $length = strlen($str);
      $result = '';
      for ($pos = 0; $pos < $length; $pos++) {
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // ASCII 0-127: one byte, passed through unchanged.
          if ($code <= 127) {
              $result .= $byte;
              continue;
          }
          // Not a valid lead byte: we are in the middle of a multibyte sequence.
          if (!($code & 64)) {
              $result .= chr($this->noCharByteVal);
              continue;
          }

          // Every further high bit of the lead byte announces one more byte.
          $sequence = $byte;
          for ($n = 0; $n < 8; $n++) {
              $code = $code << 1;
              if (!($code & 128)) {
                  break;
              }
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }
          $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
      }
      return $result;
  }
  /**
   * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
   *
   * Only well-formed numeric entities are converted: "&#" followed by decimal
   * digits, or "&#x" / "&#X" followed by hex digits. Malformed ones such as
   * "&#foo;" or "&#;" are left untouched instead of being handed to
   * UnumberToChar() as a non-numeric value.
   *
   * @param string $str Input string, UTF-8
   * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
   * @return string Output string
   */
  function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
      $trans_tbl = array();
      if ($alsoStdHtmlEnt) {
          $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
      }

      // Split on entity-like tokens and keep the captured name: even indexes
      // hold plain text, odd indexes hold what stood between "&" and ";".
      $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
      if ($parts === FALSE) {
          // PCRE failure: return the input unchanged rather than losing it.
          return $str;
      }

      foreach ($parts as $k => $v) {
          // only take every second element
          if ($k % 2 === 0) {
              continue;
          }
          $match = array();
          if (preg_match('/^#[xX]([0-9a-fA-F]+)$/D', $v, $match)) { // Hex entity
              $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
          } elseif (preg_match('/^#([0-9]+)$/D', $v, $match)) { // Decimal entity
              $parts[$k] = $this->UnumberToChar((int) $match[1]);
          } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities
              $parts[$k] = $trans_tbl['&' . $v . ';'];
          } else { // No conversion: restore the original text
              $parts[$k] = '&' . $v . ';';
          }
      }
      return implode('', $parts);
  }
  /**
   * Converts all chars in the input UTF-8 string into integer numbers returned in an array
   *
   * A byte above 127 that cannot start a sequence is reported as
   * $this->noCharByteVal (or the matching char when $retChar is set).
   *
   * @param string $str Input string, UTF-8
   * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
   * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
   * @return array Output array with the char numbers
   */
  function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
      // Resolve entities first when they must count as characters.
      if ($convEntities) {
          $str = $this->entities_to_utf8($str, 1);
      }

      $length = strlen($str);
      $numbers = array();
      for ($pos = 0; $pos < $length; $pos++) {
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // ASCII 0-127: one byte, reported as is.
          if ($code <= 127) {
              $numbers[] = $retChar ? $byte : $code;
              continue;
          }
          // Not a valid lead byte: we are in the middle of a multibyte sequence.
          if (!($code & 64)) {
              $numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
              continue;
          }

          // Every further high bit of the lead byte announces one more byte.
          $sequence = $byte;
          for ($n = 0; $n < 8; $n++) {
              $code = $code << 1;
              if (!($code & 128)) {
                  break;
              }
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }
          $numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
      }
      return $numbers;
  }
  /**
   * Encodes a UNICODE number as a UTF-8 multibyte character.
   * Algorithm based on a script found at http://czyborra.com/utf/
   * Unit-tested by Kasper
   *
   * The bits of the integer value are spread across the bytes; the number of high bits
   * set in the lead byte announces the number of bytes in the sequence:
   *
   *  bytes | bits | representation
   *      1 |    7 | 0vvvvvvv
   *      2 |   11 | 110vvvvv 10vvvvvv
   *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
   *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
   *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
   *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
   *
   * Values of 0x80000000 and above cannot be expressed; chr($this->noCharByteVal) is returned for them.
   *
   * @param integer UNICODE integer
   * @return string UTF-8 multibyte character string
   * @see utf8CharToUnumber()
   */
  function UnumberToChar($cbyte) {
      // 7-bit values are a single byte.
      if ($cbyte < 0x80) {
          return chr($cbyte);
      }

      // One row per sequence length:
      // exclusive upper bound, marker bits of the lead byte, number of continuation bytes.
      $layouts = array(
          array(0x800, 0xC0, 1),
          array(0x10000, 0xE0, 2),
          array(0x200000, 0xF0, 3),
          array(0x4000000, 0xF8, 4),
          array(0x80000000, 0xFC, 5),
      );

      foreach ($layouts as $layout) {
          if ($cbyte < $layout[0]) {
              $trailing = $layout[2];
              // Lead byte carries the topmost bits of the value...
              $char = chr($layout[1] | ($cbyte >> (6 * $trailing)));
              // ...each continuation byte carries the next six, most significant first.
              for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
                  $char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
              }
              return $char;
          }
      }

      // Cannot express a 32-bit character in UTF-8
      return chr($this->noCharByteVal);
  }
  /**
   * Decodes a single UTF-8 multibyte character into its UNICODE number.
   * Unit-tested by Kasper
   *
   * Only the first character of the string is looked at. If the first byte does not have
   * its two highest bits set, its byte value is returned as is.
   *
   * @param string UTF-8 multibyte character string
   * @param boolean If set, the number is returned as a hex string prefixed with "x" (e.g. "x20ac").
   * @return integer UNICODE integer (or the hex string, see above)
   * @see UnumberToChar()
   */
  function utf8CharToUnumber($str, $hex = 0) {
      $leadByte = ord(substr($str, 0, 1));

      if (($leadByte & 192) != 192) {
          // Not the start of a multibyte sequence: the byte value is the number.
          $codepoint = $leadByte;
      } else {
          $payloadBits = '';
          $probe = $leadByte;
          $trailing = 0;
          // Each further high bit set in the lead byte stands for one continuation byte,
          // of which the lower six bits are payload.
          while ($trailing < 8) {
              $probe = $probe << 1;
              if (!($probe & 128)) {
                  break;
              }
              $payloadBits .= substr(sprintf('%08b', ord(substr($str, $trailing + 1, 1))), -6);
              $trailing++;
          }
          // The lead byte contributes its lowest (6 - number of continuation bytes) bits.
          $payloadBits = substr(sprintf('%08b', $leadByte), -(6 - $trailing)) . $payloadBits;
          $codepoint = bindec($payloadBits);
      }

      return $hex ? 'x' . dechex($codepoint) : $codepoint;
  }
  927. /********************************************
  928. *
  929. * Init functions
  930. *
  931. ********************************************/
  /**
   * Makes the conversion table of a charset available in $this->parsedCharsets[$charset],
   * provided a table file for it exists in the PATH_t3lib.'csconvtbl/' folder.
   * This function is automatically called by the conversion functions.
   *
   * The table is stored as array('local' => [local byte value => UTF-8 char],
   * 'utf8' => [UTF-8 char => local byte value]); only local values above 127 are recorded.
   * A serialized copy is read from / written to typo3temp/cs/charset_[charset].tbl.
   *
   * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
   *
   * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
   * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the table was found and loaded (from the cache file or by parsing).
   * @access private
   */
  function initCharset($charset) {
      // Nothing to do when the table is already in memory.
      if (is_array($this->parsedCharsets[$charset])) {
          return 1;
      }

      // The source table has to exist, otherwise the charset is unknown.
      $tableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
      if (!($charset && t3lib_div::validPathStr($tableFile) && @is_file($tableFile))) {
          return FALSE;
      }

      // A cached, already parsed copy is preferred.
      // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // Matches the "whitespaced" syntax "0x0A 0x000A #LINE FEED";
      // everything else is taken as "ms-token", like "B9 = U+00B9 : SUPERSCRIPT ONE".
      $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

      $rows = t3lib_div::trimExplode(LF, t3lib_div::getUrl($tableFile), 1);
      $table = array('local' => array(), 'utf8' => array());
      $format = '';
      foreach ($rows as $row) {
          // Blank lines and comment lines carry no mapping.
          if (!trim($row) || substr($row, 0, 1) == '#') {
              continue;
          }
          // The file format is decided once, on the first real line.
          if (!$format) {
              $format = preg_match($whitespacedPattern, $row) ? 'whitespaced' : 'ms-token';
          }
          if ($format == 'ms-token') {
              list($hexbyte, $utf8) = preg_split('/[=:]/', $row, 3);
          } elseif ($format == 'whitespaced') {
              $regA = array();
              preg_match($whitespacedPattern, $row, $regA);
              $hexbyte = $regA[1];
              $utf8 = 'U+' . $regA[2];
          }
          $localCode = hexdec(trim($hexbyte));
          if ($localCode > 127) {
              // Strip the leading "U+" before reading the hex number.
              $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
              $table['local'][$localCode] = $utf8Char;
              $table['utf8'][$utf8Char] = $localCode;
          }
      }
      $this->parsedCharsets[$charset] = $table;

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
      }
      return 2;
  }
  /**
   * This function initializes all UTF-8 character data tables:
   * $this->caseFolding['utf-8'] (with 'toUpper', 'toLower', 'toTitle') and $this->toASCII['utf-8'].
   *
   * For the modes "case" and "ascii" an already loaded table or a cache file in typo3temp/cs/
   * is used when available. Otherwise unidata/UnicodeData.txt (required) and, when present,
   * unidata/SpecialCasing.txt and unidata/Translit.txt are parsed; BOTH tables are then
   * rebuilt and written to their cache files.
   *
   * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
   *
   * @param string Mode ("case", "ascii", ...)
   * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
   * @access private
   */
  function initUnicodeData($mode = NULL) {
      // cache files
      $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
      $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

      // Only process if the tables are not yet loaded
      switch ($mode) {
          case 'case':
              if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                  return 1;
              }
              // Use cached version if possible
              if ($cacheFileCase && @is_file($cacheFileCase)) {
                  $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                  return 2;
              }
              break;
          case 'ascii':
              if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                  return 1;
              }
              // Use cached version if possible
              if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                  $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                  return 2;
              }
              break;
      }

      // process main Unicode data file
      $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
      if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
          return FALSE;
      }
      $fh = fopen($unicodeDataFile, 'rb');
      if (!$fh) {
          return FALSE;
      }

      // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
      // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
      $this->caseFolding['utf-8'] = array();
      $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
      $utf8CaseFolding['toUpper'] = array();
      $utf8CaseFolding['toLower'] = array();
      $utf8CaseFolding['toTitle'] = array();

      $decomposition = array(); // array of temp. decompositions
      $mark = array(); // array of chars that are marks (eg. composing accents)
      $number = array(); // array of chars that are numbers (eg. digits)
      $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

      while (!feof($fh)) {
          $line = fgets($fh, 4096);
          if ($line === FALSE) {
              // nothing more to read
              break;
          }
          if (trim($line) === '') {
              // a blank line carries no record
              continue;
          }
          // A record has a lot of info; pad so that short lines do not raise notices.
          // Used fields: 0 = code, 1 = name, 2 = category, 5 = decomposition,
          // 8 = numeric value, 12/13/14 = upper/lower/title case mapping.
          $fields = array_pad(explode(';', rtrim($line)), 15, '');
          $char = $fields[0];
          $name = $fields[1];
          $cat = $fields[2];
          $decomp = $fields[5];
          $num = $fields[8];
          $upper = $fields[12];
          $lower = $fields[13];
          $title = $fields[14];

          $ord = hexdec($char);
          if ($ord > 0xFFFF) {
              break;
          } // only process the BMP

          $utf8_char = $this->UnumberToChar($ord);
          if ($upper) {
              $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
          }
          if ($lower) {
              $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
          }
          // store "title" only when different from "upper" (only a few)
          if ($title && $title != $upper) {
              $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
          }

          // First letter of the general category (substr() instead of the removed {0} offset syntax)
          switch (substr($cat, 0, 1)) {
              case 'M': // mark (accent, umlaut, ...)
                  $mark["U+$char"] = 1;
                  break;
              case 'N': // numeric value
                  if ($ord > 0x80 && $num != '') {
                      $number["U+$char"] = $num;
                  }
          }

          // accented Latin letters without "official" decomposition
          $match = array();
          if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
              $c = ord($match[2]);
              if ($match[1] == 'SMALL') {
                  $c += 32;
              }
              $decomposition["U+$char"] = array(dechex($c));
              continue;
          }

          $match = array();
          if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
              // "continue 2" below leaves the switch AND skips to the next line of the file
              switch ($match[1]) {
                  case '<circle>': // add parenthesis as circle replacement, eg (1)
                      $match[2] = '0028 ' . $match[2] . ' 0029';
                      break;
                  case '<square>': // add square brackets as square replacement, eg [1]
                      $match[2] = '005B ' . $match[2] . ' 005D';
                      break;
                  case '<compat>': // ignore multi char decompositions that start with a space
                      if (preg_match('/^0020 /', $match[2])) {
                          continue 2;
                      }
                      break;
                  // ignore Arabic and vertical layout presentation decomposition
                  case '<initial>':
                  case '<medial>':
                  case '<final>':
                  case '<isolated>':
                  case '<vertical>':
                      continue 2;
              }
              $decomposition["U+$char"] = explode(' ', $match[2]);
          }
      }
      fclose($fh);

      // process additional Unicode data for casing (allow folded characters to expand into a sequence)
      $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
      if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
          $fh = fopen($specialCasingFile, 'rb');
          if ($fh) {
              while (!feof($fh)) {
                  $line = fgets($fh, 4096);
                  if ($line === FALSE) {
                      break;
                  }
                  // skip comment lines and blank lines
                  if (substr($line, 0, 1) == '#' || trim($line) == '') {
                      continue;
                  }
                  list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
                  // only unconditional mappings: the 5th field is empty or already the trailing comment
                  if ($cond == '' || substr($cond, 0, 1) == '#') {
                      $utf8_char = $this->UnumberToChar(hexdec($char));
                      if ($char != $lower) {
                          $arr = explode(' ', $lower);
                          foreach ($arr as $i => $hexCode) {
                              $arr[$i] = $this->UnumberToChar(hexdec($hexCode));
                          }
                          $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                      }
                      if ($char != $title && $title != $upper) {
                          $arr = explode(' ', $title);
                          foreach ($arr as $i => $hexCode) {
                              $arr[$i] = $this->UnumberToChar(hexdec($hexCode));
                          }
                          $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                      }
                      if ($char != $upper) {
                          $arr = explode(' ', $upper);
                          foreach ($arr as $i => $hexCode) {
                              $arr[$i] = $this->UnumberToChar(hexdec($hexCode));
                          }
                          $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                      }
                  }
              }
              fclose($fh);
          }
      }

      // process custom decompositions
      $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
      if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
          $fh = fopen($customTranslitFile, 'rb');
          if ($fh) {
              while (!feof($fh)) {
                  $line = fgets($fh, 4096);
                  if ($line === FALSE) {
                      break;
                  }
                  // skip comment lines and blank lines
                  if (substr($line, 0, 1) == '#' || trim($line) == '') {
                      continue;
                  }
                  list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
                  // an empty transliteration means: drop this character
                  if (!$translit) {
                      $omit["U+$char"] = 1;
                  }
                  $decomposition["U+$char"] = explode(' ', $translit);
              }
              fclose($fh);
          }
      }

      // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
      foreach ($decomposition as $from => $to) {
          $code_decomp = array();
          while ($code_value = array_shift($to)) {
              if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                  foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                      array_unshift($to, $cv);
                  }
              } elseif (!isset($mark["U+$code_value"])) { // remove mark
                  array_push($code_decomp, $code_value);
              }
          }
          if (count($code_decomp) || isset($omit[$from])) {
              $decomposition[$from] = $code_decomp;
          } else {
              unset($decomposition[$from]);
          }
      }

      // create ascii only mapping
      $this->toASCII['utf-8'] = array();
      $ascii =& $this->toASCII['utf-8'];
      foreach ($decomposition as $from => $to) {
          $code_decomp = array();
          while ($code_value = array_shift($to)) {
              $ord = hexdec($code_value);
              if ($ord > 127) {
                  continue 2;
              } // skip decompositions containing non-ASCII chars
              array_push($code_decomp, chr($ord));
          }
          // keys look like "U+00C0": strip the "U+" so hexdec() only sees hex digits
          $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
      }

      // add numeric decompositions
      foreach ($number as $from => $to) {
          $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
          if (!isset($ascii[$utf8_char])) {
              $ascii[$utf8_char] = $to;
          }
      }

      if ($cacheFileCase) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
      }
      if ($cacheFileASCII) {
          t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
      }
      return 3;
  }
  /**
   * Builds the case folding table ($this->caseFolding[$charset]) for a charset other than UTF-8.
   * This function is automatically called by the case folding functions.
   *
   * The table is derived from the UTF-8 case folding table: every character of the charset's
   * conversion table is folded in UTF-8 and the result converted back to the charset.
   * The ASCII letters a-z / A-Z are added afterwards.
   *
   * @param string Charset for which to initialize case folding.
   * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
   * @access private
   */
  function initCaseFolding($charset) {
      // Already in memory?
      if (is_array($this->caseFolding[$charset])) {
          return 1;
      }

      // Cached on disk?
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // Both the charset's conversion table and the UTF-8 case folding
      // (used as the base conversion table) are required.
      if (!$this->initCharset($charset)) {
          return FALSE;
      }
      if (!$this->initUnicodeData('case')) {
          return FALSE;
      }

      $nochar = chr($this->noCharByteVal);
      $directions = array('toUpper', 'toLower', 'toTitle');
      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
          $c = $this->utf8_decode($utf8, $charset);
          foreach ($directions as $direction) {
              $folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
              // Skip characters without a mapping or whose mapping does not exist in the charset.
              if ($folded == '' || $folded == $nochar) {
                  continue;
              }
              $this->caseFolding[$charset][$direction][$c] = $folded;
          }
      }

      // add the ASCII case table
      foreach (range('a', 'z') as $letter) {
          $this->caseFolding[$charset]['toUpper'][$letter] = chr(ord($letter) - 32);
      }
      foreach (range('A', 'Z') as $letter) {
          $this->caseFolding[$charset]['toLower'][$letter] = chr(ord($letter) + 32);
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
      }
      return 3;
  }
  /**
   * This function initializes the to-ASCII conversion table ($this->toASCII[$charset])
   * for a charset other than UTF-8.
   * This function is automatically called by the ASCII transliteration functions.
   *
   * The table maps each character of the charset (as found in its conversion table) to the
   * ASCII replacement defined for the same character in the UTF-8 transliteration table.
   * It is always an array after a successful parse, even when no character has a replacement.
   *
   * @param string Charset for which to initialize conversion.
   * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
   * @access private
   */
  function initToASCII($charset) {
      // Only process if the to-ASCII table is not yet loaded:
      if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
          return 1;
      }

      // Use cached version if possible
      $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
      if ($cacheFile && @is_file($cacheFile)) {
          $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
          return 2;
      }

      // init UTF-8 conversion for this charset
      if (!$this->initCharset($charset)) {
          return FALSE;
      }

      // UTF-8/ASCII transliteration is used as the base conversion table
      if (!$this->initUnicodeData('ascii')) {
          return FALSE;
      }

      // Make sure the table exists even if no character below gets a mapping; otherwise
      // an undefined entry would be serialized and the "already loaded" check above
      // could never succeed for this charset.
      if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
          $this->toASCII[$charset] = array();
      }

      foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
          // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
          $c = $this->utf8_decode($utf8, $charset);
          if (isset($this->toASCII['utf-8'][$utf8])) {
              $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
          }
      }

      if ($cacheFile) {
          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
      }
      return 3;
  }
  1321. /********************************************
  1322. *
  1323. * String operation functions
  1324. *
  1325. ********************************************/
  /**
   * Returns a part of a string, counted in characters of the given charset.
   * Unit-tested by Kasper (single byte charsets only)
   *
   * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is delegated to
   * mbstring or iconv; otherwise the class' own UTF-8 / EUC routines or plain byte arithmetic
   * (fixed-width two-byte and four-byte sets, single-byte sets) are used.
   *
   * @param string The character set
   * @param string Character string
   * @param integer Start position (character position)
   * @param integer Length (in characters); if omitted (NULL) everything up to the end of the string is returned
   * @return string The substring
   * @see substr(), mb_substr()
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function substr($charset, $string, $start, $len = NULL) {
      if ($len === 0 || $string === '') {
          return '';
      }

      $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
          ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
          : '';

      if ($utils == 'mbstring') {
          // cannot omit $len, when specifying charset
          if ($len == NULL) {
              $enc = mb_internal_encoding(); // save internal encoding
              mb_internal_encoding($charset);
              $str = mb_substr($string, $start);
              mb_internal_encoding($enc); // restore internal encoding
              return $str;
          }
          return mb_substr($string, $start, $len, $charset);
      }

      if ($utils == 'iconv') {
          // cannot omit $len, when specifying charset
          if ($len == NULL) {
              $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
              iconv_set_encoding('internal_encoding', $charset);
              $str = iconv_substr($string, $start);
              iconv_set_encoding('internal_encoding', $enc); // restore internal encoding
              return $str;
          }
          return iconv_substr($string, $start, $len, $charset);
      }

      if ($charset == 'utf-8') {
          return $this->utf8_substr($string, $start, $len);
      }
      if (!empty($this->eucBasedSets[$charset])) {
          return $this->euc_substr($string, $start, $charset, $len);
      }

      // Fixed-width sets: scale the character offsets to byte offsets.
      // An omitted length must stay omitted - NULL * width would be 0 and yield an empty string.
      if (!empty($this->twoByteSets[$charset])) {
          return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
      }
      if (!empty($this->fourByteSets[$charset])) {
          return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
      }

      // treat everything else as single-byte encoding
      return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
  }
  1378. /**
  1379. * Counts the number of characters.
  1380. * Unit-tested by Kasper (single byte charsets only)
  1381. *
  1382. * @param string The character set
  1383. * @param string Character string
  1384. * @return integer The number of characters
  1385. * @see strlen()
  1386. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1387. */
  1388. function strlen($charset, $string) {
  1389. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1390. return mb_strlen($string, $charset);
  1391. } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
  1392. return iconv_strlen($string, $charset);
  1393. } elseif ($charset == 'utf-8') {
  1394. return $this->utf8_strlen($string);
  1395. } elseif ($this->eucBasedSets[$charset]) {
  1396. return $this->euc_strlen($string, $charset);
  1397. } elseif ($this->twoByteSets[$charset]) {
  1398. return strlen($string) / 2;