PageRenderTime 60ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/moodle/lib/typo3/class.t3lib_cs.php

#
PHP | 2348 lines | 1521 code | 212 blank | 615 comment | 374 complexity | 5c4344bd2d09886d246b3632c4de121d MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause, AGPL-3.0, MPL-2.0-no-copyleft-exception, LGPL-3.0, Apache-2.0
  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the Typo3 project. The Typo3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. *
  17. * This script is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * This copyright notice MUST APPEAR in all copies of the script!
  23. ***************************************************************/
  24. /**
  25. * Class for conversion between charsets.
  26. *
  27. * $Id: class.t3lib_cs.php,v 1.15 2011/10/27 17:26:27 moodlerobot Exp $
  28. *
  29. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  30. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  31. */
  32. /**
  33. * [CLASS/FUNCTION INDEX of SCRIPT]
  34. *
  35. *
  36. *
  37. * 136: class t3lib_cs
  38. * 488: function parse_charset($charset)
  39. * 507: function get_locale_charset($locale)
  40. *
  41. * SECTION: Charset Conversion functions
  42. * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
  43. * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
  44. * 617: function utf8_encode($str,$charset)
  45. * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
  46. * 706: function utf8_to_entities($str)
  47. * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
  48. * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
  49. * 823: function UnumberToChar($cbyte)
  50. * 868: function utf8CharToUnumber($str,$hex=0)
  51. *
  52. * SECTION: Init functions
  53. * 911: function initCharset($charset)
  54. * 973: function initUnicodeData($mode=null)
  55. * 1198: function initCaseFolding($charset)
  56. * 1260: function initToASCII($charset)
  57. *
  58. * SECTION: String operation functions
  59. * 1331: function substr($charset,$string,$start,$len=null)
  60. * 1384: function strlen($charset,$string)
  61. * 1414: function crop($charset,$string,$len,$crop='')
  62. * 1467: function strtrunc($charset,$string,$len)
  63. * 1501: function conv_case($charset,$string,$case)
  64. * 1527: function specCharsToASCII($charset,$string)
  65. *
  66. * SECTION: Internal string operation functions
  67. * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
  68. *
  69. * SECTION: Internal UTF-8 string operation functions
  70. * 1622: function utf8_substr($str,$start,$len=null)
  71. * 1655: function utf8_strlen($str)
  72. * 1676: function utf8_strtrunc($str,$len)
  73. * 1698: function utf8_strpos($haystack,$needle,$offset=0)
  74. * 1723: function utf8_strrpos($haystack,$needle)
  75. * 1745: function utf8_char2byte_pos($str,$pos)
  76. * 1786: function utf8_byte2char_pos($str,$pos)
  77. * 1809: function utf8_char_mapping($str,$mode,$opt='')
  78. *
  79. * SECTION: Internal EUC string operation functions
  80. * 1885: function euc_strtrunc($str,$len,$charset)
  81. * 1914: function euc_substr($str,$start,$charset,$len=null)
  82. * 1939: function euc_strlen($str,$charset)
  83. * 1966: function euc_char2byte_pos($str,$pos,$charset)
  84. * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
  85. *
  86. * TOTAL FUNCTIONS: 35
  87. * (This index is automatically created/updated by the extension "extdeveval")
  88. *
  89. */
  90. /**
  91. * Notes on UTF-8
  92. *
  93. * Functions working on UTF-8 strings:
  94. *
  95. * - strchr/strstr
  96. * - strrchr
  97. * - substr_count
  98. * - implode/explode/join
  99. *
  100. * Functions nearly working on UTF-8 strings:
  101. *
  102. * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  103. * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  104. * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  105. * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
  106. * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns must use the "u" (UTF-8) modifier
  107. *
  108. * Functions NOT working on UTF-8 strings:
  109. *
  110. * - str*cmp
  111. * - stristr
  112. * - stripos
  113. * - substr
  114. * - strrev
  115. * - split/spliti
  116. * - ...
  117. *
  118. */
  119. /**
  120. * Class for conversion between charsets
  121. *
  122. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  123. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  124. * @package TYPO3
  125. * @subpackage t3lib
  126. */
  127. class t3lib_cs {
  128. var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
  129. // This is the array where parsed conversion tables are stored (cached)
  130. var $parsedCharsets = array();
  131. // An array where case folding data will be stored (cached)
  132. var $caseFolding = array();
  133. // An array where charset-to-ASCII mappings are stored (cached)
  134. var $toASCII = array();
  135. // This tells the converter which charsets have two bytes per char:
  136. var $twoByteSets = array(
  137. 'ucs-2' => 1, // 2-byte Unicode
  138. );
  139. // This tells the converter which charsets have four bytes per char:
  140. var $fourByteSets = array(
  141. 'ucs-4' => 1, // 4-byte Unicode
  142. 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  143. );
  144. // This tells the converter which charsets use a scheme like the Extended Unix Code:
  145. var $eucBasedSets = array(
  146. 'gb2312' => 1, // Chinese, simplified.
  147. 'big5' => 1, // Chinese, traditional.
  148. 'euc-kr' => 1, // Korean
  149. 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
  150. );
  151. // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
  152. // http://czyborra.com/charsets/iso8859.html
  153. var $synonyms = array(
  154. 'us' => 'ascii',
  155. 'us-ascii' => 'ascii',
  156. 'cp819' => 'iso-8859-1',
  157. 'ibm819' => 'iso-8859-1',
  158. 'iso-ir-100' => 'iso-8859-1',
  159. 'iso-ir-101' => 'iso-8859-2',
  160. 'iso-ir-109' => 'iso-8859-3',
  161. 'iso-ir-110' => 'iso-8859-4',
  162. 'iso-ir-144' => 'iso-8859-5',
  163. 'iso-ir-127' => 'iso-8859-6',
  164. 'iso-ir-126' => 'iso-8859-7',
  165. 'iso-ir-138' => 'iso-8859-8',
  166. 'iso-ir-148' => 'iso-8859-9',
  167. 'iso-ir-157' => 'iso-8859-10',
  168. 'iso-ir-179' => 'iso-8859-13',
  169. 'iso-ir-199' => 'iso-8859-14',
  170. 'iso-ir-203' => 'iso-8859-15',
  171. 'csisolatin1' => 'iso-8859-1',
  172. 'csisolatin2' => 'iso-8859-2',
  173. 'csisolatin3' => 'iso-8859-3',
  174. 'csisolatin5' => 'iso-8859-9',
  175. 'csisolatin8' => 'iso-8859-14',
  176. 'csisolatin9' => 'iso-8859-15',
  177. 'csisolatingreek' => 'iso-8859-7',
  178. 'iso-celtic' => 'iso-8859-14',
  179. 'latin1' => 'iso-8859-1',
  180. 'latin2' => 'iso-8859-2',
  181. 'latin3' => 'iso-8859-3',
  182. 'latin5' => 'iso-8859-9',
  183. 'latin6' => 'iso-8859-10',
  184. 'latin8' => 'iso-8859-14',
  185. 'latin9' => 'iso-8859-15',
  186. 'l1' => 'iso-8859-1',
  187. 'l2' => 'iso-8859-2',
  188. 'l3' => 'iso-8859-3',
  189. 'l5' => 'iso-8859-9',
  190. 'l6' => 'iso-8859-10',
  191. 'l8' => 'iso-8859-14',
  192. 'l9' => 'iso-8859-15',
  193. 'cyrillic' => 'iso-8859-5',
  194. 'arabic' => 'iso-8859-6',
  195. 'tis-620' => 'iso-8859-11',
  196. 'win874' => 'windows-874',
  197. 'win1250' => 'windows-1250',
  198. 'win1251' => 'windows-1251',
  199. 'win1252' => 'windows-1252',
  200. 'win1253' => 'windows-1253',
  201. 'win1254' => 'windows-1254',
  202. 'win1255' => 'windows-1255',
  203. 'win1256' => 'windows-1256',
  204. 'win1257' => 'windows-1257',
  205. 'win1258' => 'windows-1258',
  206. 'cp1250' => 'windows-1250',
  207. 'cp1251' => 'windows-1251',
  208. 'cp1252' => 'windows-1252',
  209. 'ms-ee' => 'windows-1250',
  210. 'ms-ansi' => 'windows-1252',
  211. 'ms-greek' => 'windows-1253',
  212. 'ms-turk' => 'windows-1254',
  213. 'winbaltrim' => 'windows-1257',
  214. 'koi-8ru' => 'koi-8r',
  215. 'koi8r' => 'koi-8r',
  216. 'cp878' => 'koi-8r',
  217. 'mac' => 'macroman',
  218. 'macintosh' => 'macroman',
  219. 'euc-cn' => 'gb2312',
  220. 'x-euc-cn' => 'gb2312',
  221. 'euccn' => 'gb2312',
  222. 'cp936' => 'gb2312',
  223. 'big-5' => 'big5',
  224. 'cp950' => 'big5',
  225. 'eucjp' => 'euc-jp',
  226. 'sjis' => 'shift_jis',
  227. 'shift-jis' => 'shift_jis',
  228. 'cp932' => 'shift_jis',
  229. 'cp949' => 'euc-kr',
  230. 'utf7' => 'utf-7',
  231. 'utf8' => 'utf-8',
  232. 'utf16' => 'utf-16',
  233. 'utf32' => 'utf-32',
  234. 'utf8' => 'utf-8',
  235. 'ucs2' => 'ucs-2',
  236. 'ucs4' => 'ucs-4',
  237. );
  // Mapping of language identifiers to script (language family) names.
  // Three kinds of keys are accepted: ISO 639-1 codes, Microsoft three-letter
  // language codes and English language names. The script names are the keys of
  // $script_to_charset_unix / $script_to_charset_windows.
  var $lang_to_script = array(
      // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
      'ar' => 'arabic',
      'bg' => 'cyrillic', // Bulgarian
      'bs' => 'east_european', // Bosnian
      'cs' => 'east_european', // Czech
      'da' => 'west_european', // Danish
      'de' => 'west_european', // German
      'es' => 'west_european', // Spanish
      'et' => 'estonian',
      'eo' => 'unicode', // Esperanto
      'eu' => 'west_european', // Basque
      'fa' => 'arabic', // Persian
      'fi' => 'west_european', // Finnish
      'fo' => 'west_european', // Faroese
      'fr' => 'west_european', // French
      'ga' => 'west_european', // Irish
      'gl' => 'west_european', // Galician
      'gr' => 'greek', // not an ISO 639-1 code, kept for backward compatibility
      'el' => 'greek', // Greek (the actual ISO 639-1 code)
      'he' => 'hebrew', // Hebrew (since 1998)
      'hi' => 'unicode', // Hindi
      'hr' => 'east_european', // Croatian
      'hu' => 'east_european', // Hungarian
      'iw' => 'hebrew', // Hebrew (til 1998)
      'is' => 'west_european', // Icelandic
      'it' => 'west_european', // Italian
      'ja' => 'japanese',
      'ka' => 'unicode', // Georgian
      'kl' => 'west_european', // Greenlandic
      'km' => 'unicode', // Khmer
      'ko' => 'korean',
      'lt' => 'lithuanian',
      'lv' => 'west_european', // Latvian/Lettish
      'nl' => 'west_european', // Dutch
      'no' => 'west_european', // Norwegian
      'nb' => 'west_european', // Norwegian Bokmal
      'nn' => 'west_european', // Norwegian Nynorsk
      'pl' => 'east_european', // Polish
      'pt' => 'west_european', // Portuguese
      'ro' => 'east_european', // Romanian
      'ru' => 'cyrillic', // Russian
      'sk' => 'east_european', // Slovak
      'sl' => 'east_european', // Slovenian
      'sr' => 'cyrillic', // Serbian
      'sv' => 'west_european', // Swedish
      'sq' => 'albanian', // Albanian
      'th' => 'thai',
      'tr' => 'turkish', // Turkish (was missing although 'trk' and 'turkish' exist)
      'uk' => 'cyrillic', // Ukrainian
      'vi' => 'vietnamese',
      'zh' => 'chinese',
      // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
      // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
      'ara' => 'arabic',
      'bgr' => 'cyrillic', // Bulgarian
      'cat' => 'west_european', // Catalan
      'chs' => 'simpl_chinese',
      'cht' => 'trad_chinese',
      'csy' => 'east_european', // Czech
      'dan' => 'west_european', // Danish
      'deu' => 'west_european', // German
      'dea' => 'west_european', // German (Austrian)
      'des' => 'west_european', // German (Swiss)
      'ena' => 'west_european', // English (Australian)
      'enc' => 'west_european', // English (Canadian)
      'eng' => 'west_european', // English
      'enz' => 'west_european', // English (New Zealand)
      'enu' => 'west_european', // English (United States)
      'euq' => 'west_european', // Basque
      'fos' => 'west_european', // Faroese
      'far' => 'arabic', // Persian
      'fin' => 'west_european', // Finnish
      'fra' => 'west_european', // French
      'frb' => 'west_european', // French (Belgian)
      'frc' => 'west_european', // French (Canadian)
      'frs' => 'west_european', // French (Swiss)
      'geo' => 'unicode', // Georgian
      'glg' => 'west_european', // Galician
      'ell' => 'greek',
      'heb' => 'hebrew',
      'hin' => 'unicode', // Hindi
      'hun' => 'east_european', // Hungarian
      'isl' => 'west_european', // Icelandic (script name was misspelled 'west_euorpean')
      'ita' => 'west_european', // Italian
      'its' => 'west_european', // Italian (Swiss)
      'jpn' => 'japanese',
      'khm' => 'unicode', // Khmer
      'kor' => 'korean',
      'lth' => 'lithuanian',
      'lvi' => 'west_european', // Latvian/Lettish
      'msl' => 'west_european', // Malay
      'nlb' => 'west_european', // Dutch (Belgian)
      'nld' => 'west_european', // Dutch
      'nor' => 'west_european', // Norwegian (bokmal)
      'non' => 'west_european', // Norwegian (nynorsk)
      'plk' => 'east_european', // Polish
      'ptg' => 'west_european', // Portuguese
      'ptb' => 'west_european', // Portuguese (Brazil)
      'rom' => 'east_european', // Romanian
      'rus' => 'cyrillic', // Russian
      'slv' => 'east_european', // Slovenian
      'sky' => 'east_european', // Slovak
      'srl' => 'east_european', // Serbian (Latin)
      'srb' => 'cyrillic', // Serbian (Cyrillic)
      'esp' => 'west_european', // Spanish (trad. sort)
      'esm' => 'west_european', // Spanish (Mexican)
      'esn' => 'west_european', // Spanish (internat. sort)
      'sve' => 'west_european', // Swedish
      'sqi' => 'albanian', // Albanian
      'tha' => 'thai',
      'trk' => 'turkish',
      'ukr' => 'cyrillic', // Ukrainian
      // English language names
      'albanian' => 'albanian',
      'arabic' => 'arabic',
      'basque' => 'west_european',
      'bosnian' => 'east_european',
      'bulgarian' => 'cyrillic', // consistent with 'bg' / 'bgr' (was 'east_european')
      'catalan' => 'west_european',
      'croatian' => 'east_european',
      'czech' => 'east_european',
      'danish' => 'west_european',
      'dutch' => 'west_european',
      'english' => 'west_european',
      'esperanto' => 'unicode',
      'estonian' => 'estonian',
      'faroese' => 'west_european',
      'farsi' => 'arabic',
      'finnish' => 'west_european',
      'french' => 'west_european',
      'galician' => 'west_european',
      'georgian' => 'unicode',
      'german' => 'west_european',
      'greek' => 'greek',
      'greenlandic' => 'west_european',
      'hebrew' => 'hebrew',
      'hindi' => 'unicode',
      'hungarian' => 'east_european',
      'icelandic' => 'west_european',
      'italian' => 'west_european',
      'khmer' => 'unicode',
      'latvian' => 'west_european',
      'lettish' => 'west_european',
      'lithuanian' => 'lithuanian',
      'malay' => 'west_european',
      'norwegian' => 'west_european',
      'persian' => 'arabic',
      'polish' => 'east_european',
      'portuguese' => 'west_european',
      'russian' => 'cyrillic',
      'romanian' => 'east_european',
      'serbian' => 'cyrillic',
      'slovak' => 'east_european',
      'slovenian' => 'east_european',
      'spanish' => 'west_european',
      'svedish' => 'west_european', // misspelled key, kept for backward compatibility
      'swedish' => 'west_european',
      'that' => 'thai', // misspelled key, kept for backward compatibility
      'thai' => 'thai',
      'turkish' => 'turkish',
      'ukrainian' => 'cyrillic',
  );
  398. // mapping of language (family) names to charsets on Unix
  399. var $script_to_charset_unix = array(
  400. 'west_european' => 'iso-8859-1',
  401. 'estonian' => 'iso-8859-1',
  402. 'east_european' => 'iso-8859-2',
  403. 'baltic' => 'iso-8859-4',
  404. 'cyrillic' => 'iso-8859-5',
  405. 'arabic' => 'iso-8859-6',
  406. 'greek' => 'iso-8859-7',
  407. 'hebrew' => 'iso-8859-8',
  408. 'turkish' => 'iso-8859-9',
  409. 'thai' => 'iso-8859-11', // = TIS-620
  410. 'lithuanian' => 'iso-8859-13',
  411. 'chinese' => 'gb2312', // = euc-cn
  412. 'japanese' => 'euc-jp',
  413. 'korean' => 'euc-kr',
  414. 'simpl_chinese' => 'gb2312',
  415. 'trad_chinese' => 'big5',
  416. 'vietnamese' => '',
  417. 'unicode' => 'utf-8',
  418. 'albanian' => 'utf-8'
  419. );
  420. // mapping of language (family) names to charsets on Windows
  421. var $script_to_charset_windows = array(
  422. 'east_european' => 'windows-1250',
  423. 'cyrillic' => 'windows-1251',
  424. 'west_european' => 'windows-1252',
  425. 'greek' => 'windows-1253',
  426. 'turkish' => 'windows-1254',
  427. 'hebrew' => 'windows-1255',
  428. 'arabic' => 'windows-1256',
  429. 'baltic' => 'windows-1257',
  430. 'estonian' => 'windows-1257',
  431. 'lithuanian' => 'windows-1257',
  432. 'vietnamese' => 'windows-1258',
  433. 'thai' => 'cp874',
  434. 'korean' => 'cp949',
  435. 'chinese' => 'gb2312',
  436. 'japanese' => 'shift_jis',
  437. 'simpl_chinese' => 'gb2312',
  438. 'trad_chinese' => 'big5',
  439. 'albanian' => 'windows-1250',
  440. 'unicode' => 'utf-8'
  441. );
  // Mapping of complete locale names to charsets.
  // get_locale_charset() lowercases the locale before looking it up here, so a key
  // has to be lowercase to be reachable through that method. The mixed-case
  // 'sr@Latn' entry is kept only for code that reads this array directly.
  var $locale_to_charset = array(
      'japanese.euc' => 'euc-jp',
      'ja_jp.ujis' => 'euc-jp',
      'korean.euc' => 'euc-kr',
      'sr@Latn' => 'iso-8859-2',
      'sr@latn' => 'iso-8859-2',
      'zh_cn' => 'gb2312',
      'zh_hk' => 'big5',
      'zh_tw' => 'big5',
  );
  452. // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
  453. // Empty values means "iso-8859-1"
  454. var $charSetArray = array(
  455. 'dk' => '',
  456. 'de' => '',
  457. 'no' => '',
  458. 'it' => '',
  459. 'fr' => '',
  460. 'es' => '',
  461. 'nl' => '',
  462. 'cz' => 'windows-1250',
  463. 'pl' => 'iso-8859-2',
  464. 'si' => 'windows-1250',
  465. 'fi' => '',
  466. 'tr' => 'iso-8859-9',
  467. 'se' => '',
  468. 'pt' => '',
  469. 'ru' => 'windows-1251',
  470. 'ro' => 'iso-8859-2',
  471. 'ch' => 'gb2312',
  472. 'sk' => 'windows-1250',
  473. 'lt' => 'windows-1257',
  474. 'is' => 'utf-8',
  475. 'hr' => 'windows-1250',
  476. 'hu' => 'iso-8859-2',
  477. 'gl' => '',
  478. 'th' => 'iso-8859-11',
  479. 'gr' => 'iso-8859-7',
  480. 'hk' => 'big5',
  481. 'eu' => '',
  482. 'bg' => 'windows-1251',
  483. 'br' => '',
  484. 'et' => 'iso-8859-4',
  485. 'ar' => 'iso-8859-6',
  486. 'he' => 'utf-8',
  487. 'ua' => 'windows-1251',
  488. 'jp' => 'shift_jis',
  489. 'lv' => 'utf-8',
  490. 'vn' => 'utf-8',
  491. 'ca' => 'iso-8859-15',
  492. 'ba' => 'iso-8859-2',
  493. 'kr' => 'euc-kr',
  494. 'eo' => 'utf-8',
  495. 'my' => '',
  496. 'hi' => 'utf-8',
  497. 'fo' => 'utf-8',
  498. 'fa' => 'utf-8',
  499. 'sr' => 'utf-8',
  500. 'sq' => 'utf-8',
  501. 'ge' => 'utf-8',
  502. 'ga' => '',
  503. 'km' => 'utf-8',
  504. 'qc' => '',
  505. );
  506. // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
  507. // Missing keys means: same as Typo3
  508. var $isoArray = array(
  509. 'ba' => 'bs',
  510. 'br' => 'pt_BR',
  511. 'ch' => 'zh_CN',
  512. 'cz' => 'cs',
  513. 'dk' => 'da',
  514. 'si' => 'sl',
  515. 'se' => 'sv',
  516. 'gl' => 'kl',
  517. 'gr' => 'el',
  518. 'hk' => 'zh_HK',
  519. 'kr' => 'ko',
  520. 'ua' => 'uk',
  521. 'jp' => 'ja',
  522. 'qc' => 'fr_CA',
  523. 'vn' => 'vi',
  524. 'ge' => 'ka',
  525. 'ga' => 'gl',
  526. );
  /**
   * Normalizes a charset name: the name is lowercased, surrounding whitespace is
   * stripped and, if the result is a known alias, the canonical name is returned.
   *
   * @param string Charset name as supplied by the caller
   * @return string Canonical charset name (the cleaned-up input if it is no known alias)
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function parse_charset($charset) {
      $normalized = trim(strtolower($charset));

      return isset($this->synonyms[$normalized])
          ? $this->synonyms[$normalized]
          : $normalized;
  }
  /**
   * Get the charset of a locale.
   *
   * Accepted locale formats:
   * ln language
   * ln_CN language / country
   * ln_CN.cs language / country / charset
   * ln_CN.cs@mod language / country / charset / modifier
   *
   * Resolution order: exact entry in $locale_to_charset, charset contained in the
   * locale string, the "euro" modifier, and finally the script of the language
   * looked up in the OS specific script-to-charset table. Unknown languages and
   * scripts without a charset fall back to 'windows-1252' (when TYPO3_OS is 'WIN')
   * or 'iso-8859-1' (otherwise).
   *
   * The TYPO3_OS constant is not defined in this method; it has to be provided
   * by the including application.
   *
   * @param string Locale string
   * @return string Charset resolved for locale string
   * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
   */
  function get_locale_charset($locale) {
      $locale = strtolower($locale);

      // Exact locale specific charset? The table keys are compared in lowercase as
      // well, otherwise mixed-case keys (e.g. 'sr@Latn') could never match.
      foreach ($this->locale_to_charset as $knownLocale => $knownCharset) {
          if (strtolower($knownLocale) === $locale) {
              return $knownCharset;
          }
      }

      // Split off modifier and charset. array_pad() guarantees two elements, so a
      // locale without '@' or '.' yields '' instead of an undefined offset.
      list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
      list($locale, $charset) = array_pad(explode('.', $locale), 2, '');

      // locale contains charset: use it
      if ($charset) {
          return $this->parse_charset($charset);
      }

      // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
      if ($modifier == 'euro') {
          return 'iso-8859-15';
      }

      // get language (the country part is not needed for the lookup)
      list($language) = explode('_', $locale);
      $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

      if (TYPO3_OS == 'WIN') {
          $charsetTable = $this->script_to_charset_windows;
          $fallback = 'windows-1252';
      } else {
          $charsetTable = $this->script_to_charset_unix;
          $fallback = 'iso-8859-1';
      }

      // Unknown scripts and scripts mapped to an empty charset use the fallback.
      return !empty($charsetTable[$script]) ? $charsetTable[$script] : $fallback;
  }
  582. /********************************************
  583. *
  584. * Charset Conversion functions
  585. *
  586. ********************************************/
  /**
   * Convert from one charset to another charset.
   *
   * If a conversion method is configured in
   * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ('mbstring', 'iconv'
   * or 'recode') and the target is UTF-8 or no entity fallback is requested, that
   * PHP extension is tried first. If nothing is configured, the extension is not
   * loaded or the extension fails, the conversion falls back to this class' own
   * utf8_encode() / utf8_decode() with UTF-8 as intermediate charset.
   *
   * @param string Input string
   * @param string From charset (the current charset of the string)
   * @param string To charset (the output charset wanted)
   * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Converted string
   * @see convArray()
   */
  function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
      if ($fromCS == $toCS) {
          return $str;
      }

      // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
      if ($toCS == 'utf-8' || !$useEntityForNoChar) {
          // The setting is optional; reading it unguarded raises an undefined index notice.
          $convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
              ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
              : '';
          $conv_str = FALSE;

          // Each extension is only used when it is actually loaded; a configured but
          // missing extension must not cause a fatal "undefined function" error.
          switch ($convMethod) {
              case 'mbstring':
                  if (function_exists('mb_convert_encoding')) {
                      try {
                          // returns false for unsupported charsets (PHP < 8) ...
                          $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
                      } catch (ValueError $e) {
                          // ... and throws a ValueError for them as of PHP 8
                          $conv_str = FALSE;
                      }
                  }
                  break;
              case 'iconv':
                  if (function_exists('iconv')) {
                      $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
                  }
                  break;
              case 'recode':
                  if (function_exists('recode_string')) {
                      $conv_str = recode_string($fromCS . '..' . $toCS, $str);
                  }
                  break;
          }

          if (FALSE !== $conv_str) {
              return $conv_str;
          }
          // fallback to TYPO3 conversion
      }

      if ($fromCS != 'utf-8') {
          $str = $this->utf8_encode($str, $fromCS);
      }
      if ($toCS != 'utf-8') {
          $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
      }
      return $str;
  }
  /**
   * Converts every string found in an array from one charset to another.
   * Nested arrays are processed recursively; values of any other type are left
   * untouched. NOTICE: The array is passed by reference and modified in place!
   *
   * @param array Input array, possibly multidimensional
   * @param string From charset (the current charset of the strings)
   * @param string To charset (the output charset wanted)
   * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return void
   * @see conv()
   */
  function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
      foreach ($array as $index => $item) {
          if (is_array($item)) {
              // Recurse on the element itself (not the loop copy) so changes are kept.
              $this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
              continue;
          }
          if (is_string($item)) {
              $array[$index] = $this->conv($item, $fromCS, $toCS, $useEntityForNoChar);
          }
      }
  }
  /**
   * Converts $str from $charset to UTF-8
   *
   * Bytes 0-127 are copied unchanged. Charsets listed in $twoByteSets are read as
   * big endian 16 bit words, charsets listed in $eucBasedSets as one or two bytes
   * per character. Characters without an entry in the parsed conversion table
   * are replaced by the character with the byte value $noCharByteVal.
   *
   * @param string String in local charset to convert to UTF-8
   * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
   * @return string Output string, converted to UTF-8. Nothing (NULL) is returned if initCharset() reports failure for the charset.
   */
  function utf8_encode($str, $charset) {
      if ($charset === 'utf-8') {
          return $str;
      }

      // Charset is case-insensitive.
      if ($this->initCharset($charset)) { // Parse conv. table if not already...
          $strLen = strlen($str);
          $outStr = '';
          $isTwoByteSet = isset($this->twoByteSets[$charset]); // does not change while looping
          $isEucBasedSet = isset($this->eucBasedSets[$charset]);

          for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
              $chr = substr($str, $a, 1);
              $ord = ord($chr);

              if ($isTwoByteSet) { // If the charset has two bytes per char
                  // A missing trailing byte (odd string length) counts as 0.
                  $ord2 = ($a + 1 < $strLen) ? ord($str[$a + 1]) : 0;
                  $ord = $ord << 8 | $ord2; // assume big endian
                  if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that ...
                      $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                  } else { // ... otherwise the "no char" replacement byte
                      $outStr .= chr($this->noCharByteVal);
                  }
                  $a++;
              } elseif ($ord > 127) { // A byte above 127 is not ASCII and has to be looked up
                  if ($isEucBasedSet) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
                      if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
                          $a++;
                          // A missing trailing byte at the end of the string counts as 0.
                          $ord2 = ($a < $strLen) ? ord($str[$a]) : 0;
                          $ord = $ord * 256 + $ord2;
                      }
                  }
                  if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that ...
                      $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                  } else { // ... otherwise the "no char" replacement byte
                      $outStr .= chr($this->noCharByteVal);
                  }
              } else { // ... otherwise it's just ASCII 0-127 and one byte. Transparent
                  $outStr .= $chr;
              }
          }
          return $outStr;
      }
  }
  /**
   * Converts a UTF-8 string into the local charset $charset.
   *
   * ASCII bytes are copied as they are. Every multibyte sequence is looked up in
   * the parsed conversion table of the charset; if it has no local equivalent it
   * becomes either a numeric entity or the "no char" replacement byte. A
   * continuation byte found where a lead byte is expected also yields the
   * replacement byte.
   *
   * @param string String in UTF-8 to convert to local charset
   * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
   * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
   * @return string Output string, converted to local charset. Nothing (NULL) is returned if initCharset() reports failure for the charset.
   */
  function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
      if ($charset === 'utf-8') {
          return $str;
      }

      // Charset is case-insensitive. Parse conv. table if not already done.
      if (!$this->initCharset($charset)) {
          return;
      }

      $result = '';
      $total = strlen($str);

      for ($pos = 0; $pos < $total; $pos++) {
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // Plain ASCII (0-127): one byte, passed through.
          if ($code <= 127) {
              $result .= $byte;
              continue;
          }

          // A lead byte must have the 7th bit set; otherwise we are in the
          // MIDDLE of a multibyte sequence and no char exists.
          if (!($code & 64)) {
              $result .= chr($this->noCharByteVal);
              continue;
          }

          // Collect the sequence: each further leading 1-bit of the first byte
          // announces one more byte.
          $sequence = $byte;
          for ($n = 0; $n < 8; $n++) {
              $code = $code << 1;
              if (!($code & 128)) {
                  break;
              }
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }

          if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
              $local = $this->parsedCharsets[$charset]['utf8'][$sequence]; // The local number
              // Numbers above 255 are split into two chars (16bit word assumed).
              $result .= ($local > 255)
                  ? chr(($local >> 8) & 255) . chr($local & 255)
                  : chr($local);
          } elseif ($useEntityForNoChar) { // Create num entity:
              $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
          } else { // No char exists
              $result .= chr($this->noCharByteVal);
          }
      }

      return $result;
  }
  /**
   * Replaces every multibyte UTF-8 character (all chars > 127) by a numeric entity.
   * ASCII bytes are copied unchanged; a continuation byte found where a lead byte
   * is expected is replaced by the "no char" replacement byte.
   *
   * @param string Input string, UTF-8
   * @return string Output string with numeric entities
   */
  function utf8_to_entities($str) {
      $result = '';
      $total = strlen($str);

      for ($pos = 0; $pos < $total; $pos++) {
          $byte = substr($str, $pos, 1);
          $code = ord($byte);

          // Plain ASCII (0-127): one byte, passed through.
          if ($code <= 127) {
              $result .= $byte;
              continue;
          }

          // A lead byte must have the 7th bit set; otherwise we are in the
          // MIDDLE of a multibyte sequence and no char exists.
          if (!($code & 64)) {
              $result .= chr($this->noCharByteVal);
              continue;
          }

          // Collect the sequence: each further leading 1-bit of the first byte
          // announces one more byte.
          $sequence = $byte;
          for ($n = 0; $n < 8; $n++) {
              $code = $code << 1;
              if (!($code & 128)) {
                  break;
              }
              $pos++;
              $sequence .= substr($str, $pos, 1);
          }

          $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
      }

      return $result;
  }
  /**
   * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
   *
   * Only well-formed numeric entities are converted ("x" of the hexadecimal form
   * may be upper or lower case). Anything else that looks like an entity but can
   * not be converted - malformed numbers such as &#zz; or unknown names - is left
   * in the string exactly as it was.
   *
   * @param string Input string, UTF-8
   * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
   * @return string Output string
   */
  function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
      $trans_tbl = array();
      if ($alsoStdHtmlEnt) {
          // The table is requested explicitly in iso-8859-1 because the values are
          // run through utf8_encode(..., 'iso-8859-1') below. Without the explicit
          // charset newer PHP versions return UTF-8 values, which would then be
          // encoded a second time.
          $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
      }

      // Split at entities and keep the entity names: even indexes hold the text
      // between entities, odd indexes hold what was found between '&' and ';'.
      $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

      foreach ($parts as $k => $v) {
          if (!($k % 2)) {
              continue; // plain text, nothing to convert
          }

          if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) { // Hex entity:
              $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
          } elseif (preg_match('/^#([0-9]+)$/', $v, $match)) { // Dec entity:
              $parts[$k] = $this->UnumberToChar((int) $match[1]);
          } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
              $parts[$k] = $this->utf8_encode($trans_tbl['&' . $v . ';'], 'iso-8859-1');
          } else { // No conversion: restore the entity as it was
              $parts[$k] = '&' . $v . ';';
          }
      }

      return implode('', $parts);
  }
  819. /**
  820. * Converts all chars in the input UTF-8 string into integer numbers returned in an array
  821. *
  822. * @param string Input string, UTF-8
  823. * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
  824. * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
  825. * @return array Output array with the char numbers
  826. */
  827. function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
  828. // If entities must be registered as well...:
  829. if ($convEntities) {
  830. $str = $this->entities_to_utf8($str, 1);
  831. }
  832. // Do conversion:
  833. $strLen = strlen($str);
  834. $outArr = array();
  835. $buf = '';
  836. for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
  837. $chr = substr($str, $a, 1);
  838. $ord = ord($chr);
  839. if ($ord > 127) { // This means multibyte! (first byte!)
  840. if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
  841. $buf = $chr; // Add first byte
  842. for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
  843. $ord = $ord << 1; // Shift it left and ...
  844. if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
  845. $a++; // Increase pointer...
  846. $buf .= substr($str, $a, 1); // ... and add the next char.
  847. } else {
  848. break;
  849. }
  850. }
  851. $outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
  852. } else {
  853. $outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
  854. } // No char exists (MIDDLE of MB sequence!)
  855. } else {
  856. $outArr[] = $retChar ? chr($ord) : $ord;
  857. } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
  858. }
  859. return $outArr;
  860. }
  861. /**
  862. * Converts a UNICODE number to a UTF-8 multibyte character
  863. * Algorithm based on a script found at http://czyborra.com/utf/
  864. * Unit-tested by Kasper
  865. *
  866. * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
  867. *
  868. * bytes | bits | representation
  869. * 1 | 7 | 0vvvvvvv
  870. * 2 | 11 | 110vvvvv 10vvvvvv
  871. * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
  872. * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  873. * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  874. * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  875. *
  876. * @param integer UNICODE integer
  877. * @return string UTF-8 multibyte character string
  878. * @see utf8CharToUnumber()
  879. */
  880. function UnumberToChar($cbyte) {
  881. $str = '';
  882. if ($cbyte < 0x80) {
  883. $str .= chr($cbyte);
  884. } else {
  885. if ($cbyte < 0x800) {
  886. $str .= chr(0xC0 | ($cbyte >> 6));
  887. $str .= chr(0x80 | ($cbyte & 0x3F));
  888. } else {
  889. if ($cbyte < 0x10000) {
  890. $str .= chr(0xE0 | ($cbyte >> 12));
  891. $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
  892. $str .= chr(0x80 | ($cbyte & 0x3F));
  893. } else {
  894. if ($cbyte < 0x200000) {
  895. $str .= chr(0xF0 | ($cbyte >> 18));
  896. $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
  897. $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
  898. $str .= chr(0x80 | ($cbyte & 0x3F));
  899. } else {
  900. if ($cbyte < 0x4000000) {
  901. $str .= chr(0xF8 | ($cbyte >> 24));
  902. $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
  903. $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
  904. $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
  905. $str .= chr(0x80 | ($cbyte & 0x3F));
  906. } else {
  907. if ($cbyte < 0x80000000) {
  908. $str .= chr(0xFC | ($cbyte >> 30));
  909. $str .= chr(0x80 | (($cbyte >> 24) & 0x3F));
  910. $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
  911. $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
  912. $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
  913. $str .= chr(0x80 | ($cbyte & 0x3F));
  914. } else { // Cannot express a 32-bit character in UTF-8
  915. $str .= chr($this->noCharByteVal);
  916. }
  917. }
  918. }
  919. }
  920. }
  921. }
  922. return $str;
  923. }
  924. /**
  925. * Converts a UTF-8 Multibyte character to a UNICODE number
  926. * Unit-tested by Kasper
  927. *
  928. * @param string UTF-8 multibyte character string
  929. * @param boolean If set, then a hex. number is returned.
  930. * @return integer UNICODE integer
  931. * @see UnumberToChar()
  932. */
  933. function utf8CharToUnumber($str, $hex = 0) {
  934. $ord = ord(substr($str, 0, 1)); // First char
  935. if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string
  936. $binBuf = '';
  937. for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
  938. $ord = $ord << 1; // Shift it left and ...
  939. if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
  940. $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
  941. } else {
  942. break;
  943. }
  944. }
  945. $binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf;
  946. $int = bindec($binBuf);
  947. } else {
  948. $int = $ord;
  949. }
  950. return $hex ? 'x' . dechex($int) : $int;
  951. }
  952. /********************************************
  953. *
  954. * Init functions
  955. *
  956. ********************************************/
  957. /**
  958. * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
  959. * This function is automatically called by the conversion functions
  960. *
  961. * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
  962. *
  963. * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
  964. * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
  965. * @access private
  966. */
  967. function initCharset($charset) {
  968. // Only process if the charset is not yet loaded:
  969. if (empty($this->parsedCharsets[$charset]) || !is_array($this->parsedCharsets[$charset])) {
  970. // Conversion table filename:
  971. $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
  972. // If the conversion table is found:
  973. if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
  974. // Cache file for charsets:
  975. // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
  976. $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
  977. if ($cacheFile && @is_file($cacheFile)) {
  978. $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
  979. } else {
  980. // Parse conversion table into lines:
  981. $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
  982. // Initialize the internal variable holding the conv. table:
  983. $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
  984. // traverse the lines:
  985. $detectedType = '';
  986. foreach ($lines as $value) {
  987. if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored.
  988. // Detect type if not done yet: (Done on first real line)
  989. // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
  990. if (!$detectedType) {
  991. $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token';
  992. }
  993. if ($detectedType == 'ms-token') {
  994. list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
  995. } elseif ($detectedType == 'whitespaced') {
  996. $regA = array();
  997. preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
  998. $hexbyte = $regA[1];
  999. $utf8 = 'U+' . $regA[2];
  1000. }
  1001. $decval = hexdec(trim($hexbyte));
  1002. if ($decval > 127) {
  1003. $utf8decval = hexdec(substr(trim($utf8), 2));
  1004. $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
  1005. $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
  1006. }
  1007. }
  1008. }
  1009. if ($cacheFile) {
  1010. t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
  1011. }
  1012. }
  1013. return 2;
  1014. } else {
  1015. return FALSE;
  1016. }
  1017. } else {
  1018. return 1;
  1019. }
  1020. }
  1021. /**
  1022. * This function initializes all UTF-8 character data tables.
  1023. *
  1024. * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
  1025. *
  1026. * @param string Mode ("case", "ascii", ...)
  1027. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1028. * @access private
  1029. */
  1030. function initUnicodeData($mode = NULL) {
  1031. // cache files
  1032. $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
  1033. $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
  1034. // Only process if the tables are not yet loaded
  1035. switch ($mode) {
  1036. case 'case':
  1037. if (is_array($this->caseFolding['utf-8'])) {
  1038. return 1;
  1039. }
  1040. // Use cached version if possible
  1041. if ($cacheFileCase && @is_file($cacheFileCase)) {
  1042. $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
  1043. return 2;
  1044. }
  1045. break;
  1046. case 'ascii':
  1047. if (is_array($this->toASCII['utf-8'])) {
  1048. return 1;
  1049. }
  1050. // Use cached version if possible
  1051. if ($cacheFileASCII && @is_file($cacheFileASCII)) {
  1052. $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
  1053. return 2;
  1054. }
  1055. break;
  1056. }
  1057. // process main Unicode data file
  1058. $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
  1059. if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
  1060. return FALSE;
  1061. }
  1062. $fh = fopen($unicodeDataFile, 'rb');
  1063. if (!$fh) {
  1064. return FALSE;
  1065. }
  1066. // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
  1067. // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
  1068. $this->caseFolding['utf-8'] = array();
  1069. $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
  1070. $utf8CaseFolding['toUpper'] = array();
  1071. $utf8CaseFolding['toLower'] = array();
  1072. $utf8CaseFolding['toTitle'] = array();
  1073. $decomposition = array(); // array of temp. decompositions
  1074. $mark = array(); // array of chars that are marks (eg. composing accents)
  1075. $number = array(); // array of chars that are numbers (eg. digits)
  1076. $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
  1077. while (!feof($fh)) {
  1078. $line = fgets($fh, 4096);
  1079. // has a lot of info
  1080. list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));
  1081. $ord = hexdec($char);
  1082. if ($ord > 0xFFFF) {
  1083. break;
  1084. } // only process the BMP
  1085. $utf8_char = $this->UnumberToChar($ord);
  1086. if ($upper) {
  1087. $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
  1088. }
  1089. if ($lower) {
  1090. $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
  1091. }
  1092. // store "title" only when different from "upper" (only a few)
  1093. if ($title && $title != $upper) {
  1094. $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
  1095. }
  1096. switch ($cat{0}) {
  1097. case 'M': // mark (accent, umlaut, ...)
  1098. $mark["U+$char"] = 1;
  1099. break;
  1100. case 'N': // numeric value
  1101. if ($ord > 0x80 && $num != '') {
  1102. $number["U+$char"] = $num;
  1103. }
  1104. }
  1105. // accented Latin letters without "official" decomposition
  1106. $match = array();
  1107. if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
  1108. $c = ord($match[2]);
  1109. if ($match[1] == 'SMALL') {
  1110. $c += 32;
  1111. }
  1112. $decomposition["U+$char"] = array(dechex($c));
  1113. continue;
  1114. }
  1115. $match = array();
  1116. if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
  1117. switch ($match[1]) {
  1118. case '<circle>': // add parenthesis as circle replacement, eg (1)
  1119. $match[2] = '0028 ' . $match[2] . ' 0029';
  1120. break;
  1121. case '<square>': // add square brackets as square replacement, eg [1]
  1122. $match[2] = '005B ' . $match[2] . ' 005D';
  1123. break;
  1124. case '<compat>': // ignore multi char decompositions that start with a space
  1125. if (preg_match('/^0020 /', $match[2])) {
  1126. continue 2;
  1127. }
  1128. break;
  1129. // ignore Arabic and vertical layout presentation decomposition
  1130. case '<initial>':
  1131. case '<medial>':
  1132. case '<final>':
  1133. case '<isolated>':
  1134. case '<vertical>':
  1135. continue 2;
  1136. }
  1137. $decomposition["U+$char"] = explode(' ', $match[2]);
  1138. }
  1139. }
  1140. fclose($fh);
  1141. // process additional Unicode data for casing (allow folded characters to expand into a sequence)
  1142. $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
  1143. if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
  1144. $fh = fopen($specialCasingFile, 'rb');
  1145. if ($fh) {
  1146. while (!feof($fh)) {
  1147. $line = fgets($fh, 4096);
  1148. if ($line{0} != '#' && trim($line) != '') {
  1149. list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
  1150. if ($cond == '' || $cond{0} == '#') {
  1151. $utf8_char = $this->UnumberToChar(hexdec($char));
  1152. if ($char != $lower) {
  1153. $arr = explode(' ', $lower);
  1154. for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
  1155. $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
  1156. }
  1157. if ($char != $title && $title != $upper) {
  1158. $arr = explode(' ', $title);
  1159. for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
  1160. $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
  1161. }
  1162. if ($char != $upper) {
  1163. $arr = explode(' ', $upper);
  1164. for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
  1165. $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
  1166. }
  1167. }
  1168. }
  1169. }
  1170. fclose($fh);
  1171. }
  1172. }
  1173. // process custom decompositions
  1174. $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
  1175. if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
  1176. $fh = fopen($customTranslitFile, 'rb');
  1177. if ($fh) {
  1178. while (!feof($fh)) {
  1179. $line = fgets($fh, 4096);
  1180. if ($line{0} != '#' && trim($line) != '') {
  1181. list($char, $translit) = t3lib_div::trimExplode(';', $line);
  1182. if (!$translit) {
  1183. $omit["U+$char"] = 1;
  1184. }
  1185. $decomposition["U+$char"] = explode(' ', $translit);
  1186. }
  1187. }
  1188. fclose($fh);
  1189. }
  1190. }
  1191. // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
  1192. foreach ($decomposition as $from => $to) {
  1193. $code_decomp = array();
  1194. while ($code_value = array_shift($to)) {
  1195. if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
  1196. foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
  1197. array_unshift($to, $cv);
  1198. }
  1199. } elseif (!isset($mark["U+$code_value"])) { // remove mark
  1200. array_push($code_decomp, $code_value);
  1201. }
  1202. }
  1203. if (count($code_decomp) || isset($omit[$from])) {
  1204. $decomposition[$from] = $code_decomp;
  1205. } else {
  1206. unset($decomposition[$from]);
  1207. }
  1208. }
  1209. // create ascii only mapping
  1210. $this->toASCII['utf-8'] = array();
  1211. $ascii =& $this->toASCII['utf-8'];
  1212. foreach ($decomposition as $from => $to) {
  1213. $code_decomp = array();
  1214. while ($code_value = array_shift($to)) {
  1215. $ord = hexdec($code_value);
  1216. if ($ord > 127) {
  1217. continue 2;
  1218. } // skip decompositions containing non-ASCII chars
  1219. else
  1220. {
  1221. array_push($code_decomp, chr($ord));
  1222. }
  1223. }
  1224. $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
  1225. }
  1226. // add numeric decompositions
  1227. foreach ($number as $from => $to) {
  1228. $utf8_char = $this->UnumberToChar(hexdec($from));
  1229. if (!isset($ascii[$utf8_char])) {
  1230. $ascii[$utf8_char] = $to;
  1231. }
  1232. }
  1233. if ($cacheFileCase) {
  1234. t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
  1235. }
  1236. if ($cacheFileASCII) {
  1237. t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
  1238. }
  1239. return 3;
  1240. }
  1241. /**
  1242. * This function initializes the folding table for a charset other than UTF-8.
  1243. * This function is automatically called by the case folding functions.
  1244. *
  1245. * @param string Charset for which to initialize case folding.
  1246. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1247. * @access private
  1248. */
  1249. function initCaseFolding($charset) {
  1250. // Only process if the case table is not yet loaded:
  1251. if (is_array($this->caseFolding[$charset])) {
  1252. return 1;
  1253. }
  1254. // Use cached version if possible
  1255. $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
  1256. if ($cacheFile && @is_file($cacheFile)) {
  1257. $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
  1258. return 2;
  1259. }
  1260. // init UTF-8 conversion for this charset
  1261. if (!$this->initCharset($charset)) {
  1262. return FALSE;
  1263. }
  1264. // UTF-8 case folding is used as the base conversion table
  1265. if (!$this->initUnicodeData('case')) {
  1266. return FALSE;
  1267. }
  1268. $nochar = chr($this->noCharByteVal);
  1269. foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
  1270. // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
  1271. $c = $this->utf8_decode($utf8, $charset);
  1272. // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
  1273. $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
  1274. if ($cc != '' && $cc != $nochar) {
  1275. $this->caseFolding[$charset]['toUpper'][$c] = $cc;
  1276. }
  1277. // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
  1278. $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
  1279. if ($cc != '' && $cc != $nochar) {
  1280. $this->caseFolding[$charset]['toLower'][$c] = $cc;
  1281. }
  1282. // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
  1283. $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
  1284. if ($cc != '' && $cc != $nochar) {
  1285. $this->caseFolding[$charset]['toTitle'][$c] = $cc;
  1286. }
  1287. }
  1288. // add the ASCII case table
  1289. for ($i = ord('a'); $i <= ord('z'); $i++) {
  1290. $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
  1291. }
  1292. for ($i = ord('A'); $i <= ord('Z'); $i++) {
  1293. $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
  1294. }
  1295. if ($cacheFile) {
  1296. t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
  1297. }
  1298. return 3;
  1299. }
  1300. /**
  1301. * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
  1302. * This function is automatically called by the ASCII transliteration functions.
  1303. *
  1304. * @param string Charset for which to initialize conversion.
  1305. * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
  1306. * @access private
  1307. */
  1308. function initToASCII($charset) {
  1309. // Only process if the case table is not yet loaded:
  1310. if (is_array($this->toASCII[$charset])) {
  1311. return 1;
  1312. }
  1313. // Use cached version if possible
  1314. $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
  1315. if ($cacheFile && @is_file($cacheFile)) {
  1316. $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
  1317. return 2;
  1318. }
  1319. // init UTF-8 conversion for this charset
  1320. if (!$this->initCharset($charset)) {
  1321. return FALSE;
  1322. }
  1323. // UTF-8/ASCII transliteration is used as the base conversion table
  1324. if (!$this->initUnicodeData('ascii')) {
  1325. return FALSE;
  1326. }
  1327. $nochar = chr($this->noCharByteVal);
  1328. foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
  1329. // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
  1330. $c = $this->utf8_decode($utf8, $charset);
  1331. if (isset($this->toASCII['utf-8'][$utf8])) {
  1332. $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
  1333. }
  1334. }
  1335. if ($cacheFile) {
  1336. t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
  1337. }
  1338. return 3;
  1339. }
  1340. /********************************************
  1341. *
  1342. * String operation functions
  1343. *
  1344. ********************************************/
  1345. /**
  1346. * Returns a part of a string.
  1347. * Unit-tested by Kasper (single byte charsets only)
  1348. *
  1349. * @param string The character set
  1350. * @param string Character string
  1351. * @param integer Start position (character position)
  1352. * @param integer Length (in characters)
  1353. * @return string The substring
  1354. * @see substr(), mb_substr()
  1355. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1356. */
  1357. function substr($charset, $string, $start, $len = NULL) {
  1358. if ($len === 0 || $string === '') {
  1359. return '';
  1360. }
  1361. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1362. // cannot omit $len, when specifying charset
  1363. if ($len == NULL) {
  1364. $enc = mb_internal_encoding(); // save internal encoding
  1365. mb_internal_encoding($charset);
  1366. $str = mb_substr($string, $start);
  1367. mb_internal_encoding($enc); // restore internal encoding
  1368. return $str;
  1369. }
  1370. else {
  1371. return mb_substr($string, $start, $len, $charset);
  1372. }
  1373. } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
  1374. // cannot omit $len, when specifying charset
  1375. if ($len == NULL) {
  1376. $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
  1377. iconv_set_encoding('internal_encoding', $charset);
  1378. $str = iconv_substr($string, $start);
  1379. iconv_set_encoding('internal_encoding', $enc); // restore internal encoding
  1380. return $str;
  1381. }
  1382. else {
  1383. return iconv_substr($string, $start, $len, $charset);
  1384. }
  1385. } elseif ($charset == 'utf-8') {
  1386. return $this->utf8_substr($string, $start, $len);
  1387. } elseif ($this->eucBasedSets[$charset]) {
  1388. return $this->euc_substr($string, $start, $charset, $len);
  1389. } elseif ($this->twoByteSets[$charset]) {
  1390. return substr($string, $start * 2, $len * 2);
  1391. } elseif ($this->fourByteSets[$charset]) {
  1392. return substr($string, $start * 4, $len * 4);
  1393. }
  1394. // treat everything else as single-byte encoding
  1395. return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
  1396. }
  1397. /**
  1398. * Counts the number of characters.
  1399. * Unit-tested by Kasper (single byte charsets only)
  1400. *
  1401. * @param string The character set
  1402. * @param string Character string
  1403. * @return integer The number of characters
  1404. * @see strlen()
  1405. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1406. */
  1407. function strlen($charset, $string) {
  1408. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1409. return mb_strlen($string, $charset);
  1410. } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
  1411. return iconv_strlen($string, $charset);
  1412. } elseif ($charset == 'utf-8') {
  1413. return $this->utf8_strlen($string);
  1414. } elseif ($this->eucBasedSets[$charset]) {
  1415. return $this->euc_strlen($string, $charset);
  1416. } elseif ($this->twoByteSets[$charset]) {
  1417. return strlen($string) / 2;
  1418. } elseif ($this->fourByteSets[$charset]) {
  1419. return strlen($string) / 4;
  1420. }
  1421. // treat everything else as single-byte encoding
  1422. return strlen($string);
  1423. }
  1424. /**
  1425. * Method to crop strings using the mb_substr function.
  1426. *
  1427. * @param string The character set
  1428. * @param string String to be cropped
  1429. * @param integer Crop length (in characters)
  1430. * @param string Crop signifier
  1431. * @return string The shortened string
  1432. * @see mb_strlen(), mb_substr()
  1433. */
  1434. protected function cropMbstring($charset, $string, $len, $crop = '') {
  1435. if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
  1436. return $string;
  1437. }
  1438. if ($len > 0) {
  1439. $string = mb_substr($string, 0, $len, $charset) . $crop;
  1440. } else {
  1441. $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
  1442. }
  1443. return $string;
  1444. }
  1445. /**
  1446. * Truncates a string and pre-/appends a string.
  1447. * Unit tested by Kasper
  1448. *
  1449. * @param string The character set
  1450. * @param string Character string
  1451. * @param integer Length (in characters)
  1452. * @param string Crop signifier
  1453. * @return string The shortened string
  1454. * @see substr(), mb_strimwidth()
  1455. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1456. */
  1457. function crop($charset, $string, $len, $crop = '') {
  1458. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1459. return $this->cropMbstring($charset, $string, $len, $crop);
  1460. }
  1461. if (intval($len) == 0) {
  1462. return $string;
  1463. }
  1464. if ($charset == 'utf-8') {
  1465. $i = $this->utf8_char2byte_pos($string, $len);
  1466. } elseif ($this->eucBasedSets[$charset]) {
  1467. $i = $this->euc_char2byte_pos($string, $len, $charset);
  1468. } else {
  1469. if ($len > 0) {
  1470. $i = $len;
  1471. } else {
  1472. $i = strlen($string) + $len;
  1473. if ($i <= 0) {
  1474. $i = FALSE;
  1475. }
  1476. }
  1477. }
  1478. if ($i === FALSE) { // $len outside actual string length
  1479. return $string;
  1480. } else {
  1481. if ($len > 0) {
  1482. if (strlen($string{$i})) {
  1483. return substr($string, 0, $i) . $crop;
  1484. }
  1485. } else {
  1486. if (strlen($string{$i - 1})) {
  1487. return $crop . substr($string, $i);
  1488. }
  1489. }
  1490. /*
  1491. if (abs($len)<$this->strlen($charset,$string)) { // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
  1492. if ($len > 0) {
  1493. return substr($string,0,$i).$crop;
  1494. } else {
  1495. return $crop.substr($string,$i);
  1496. }
  1497. }
  1498. */
  1499. }
  1500. return $string;
  1501. }
  1502. /**
  1503. * Cuts a string short at a given byte length.
  1504. *
  1505. * @param string The character set
  1506. * @param string Character string
  1507. * @param integer The byte length
  1508. * @return string The shortened string
  1509. * @see mb_strcut()
  1510. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1511. */
  1512. function strtrunc($charset, $string, $len) {
  1513. if ($len <= 0) {
  1514. return '';
  1515. }
  1516. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1517. return mb_strcut($string, 0, $len, $charset);
  1518. } elseif ($charset == 'utf-8') {
  1519. return $this->utf8_strtrunc($string, $len);
  1520. } elseif ($this->eucBasedSets[$charset]) {
  1521. return $this->euc_strtrunc($string, $len, $charset);
  1522. } elseif ($this->twoByteSets[$charset]) {
  1523. if ($len % 2) {
  1524. $len--;
  1525. } // don't cut at odd positions
  1526. } elseif ($this->fourByteSets[$charset]) {
  1527. $x = $len % 4;
  1528. $len -= $x; // realign to position dividable by four
  1529. }
  1530. // treat everything else as single-byte encoding
  1531. return substr($string, 0, $len);
  1532. }
  1533. /**
  1534. * Translates all characters of a string into their respective case values.
  1535. * Unlike strtolower() and strtoupper() this method is locale independent.
  1536. * Note that the string length may change!
  1537. * eg. lower case German "ß" (sharp S) becomes upper case "SS"
  1538. * Unit-tested by Kasper
  1539. * Real case folding is language dependent, this method ignores this fact.
  1540. *
  1541. * @param string Character set of string
  1542. * @param string Input string to convert case for
  1543. * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
  1544. * @return string The converted string
  1545. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1546. * @see strtolower(), strtoupper()
  1547. */
  1548. function conv_case($charset, $string, $case) {
  1549. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1550. if ($case == 'toLower') {
  1551. $string = mb_strtolower($string, $charset);
  1552. } else {
  1553. $string = mb_strtoupper($string, $charset);
  1554. }
  1555. } elseif ($charset == 'utf-8') {
  1556. $string = $this->utf8_char_mapping($string, 'case', $case);
  1557. } elseif (isset($this->eucBasedSets[$charset])) {
  1558. $string = $this->euc_char_mapping($string, $charset, 'case', $case);
  1559. } else {
  1560. // treat everything else as single-byte encoding
  1561. $string = $this->sb_char_mapping($string, $charset, 'case', $case);
  1562. }
  1563. return $string;
  1564. }
  1565. /**
  1566. * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
  1567. *
  1568. * @param string Character set of string
  1569. * @param string Input string to convert
  1570. * @return string The converted string
  1571. */
  1572. function specCharsToASCII($charset, $string) {
  1573. if ($charset == 'utf-8') {
  1574. $string = $this->utf8_char_mapping($string, 'ascii');
  1575. } elseif (isset($this->eucBasedSets[$charset])) {
  1576. $string = $this->euc_char_mapping($string, $charset, 'ascii');
  1577. } else {
  1578. // treat everything else as single-byte encoding
  1579. $string = $this->sb_char_mapping($string, $charset, 'ascii');
  1580. }
  1581. return $string;
  1582. }
  1583. /**
  1584. * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
  1585. * into a TYPO3-readable language code
  1586. * @param $languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
  1587. * see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
  1588. * @return string a preferred language that TYPO3 supports, or "default" if none found
  1589. * @author Benjamin Mack (benni.typo3.org)
  1590. */
  1591. public function getPreferredClientLanguage($languageCodesList) {
  1592. $allLanguageCodes = array();
  1593. $selectedLanguage = 'default';
  1594. // get all languages where TYPO3 code is the same as the ISO code
  1595. foreach ($this->charSetArray as $typo3Lang => $charSet) {
  1596. $allLanguageCodes[$typo3Lang] = $typo3Lang;
  1597. }
  1598. // get all languages where TYPO3 code differs from ISO code
  1599. // or needs the country part
  1600. // the iso codes will here overwrite the default typo3 language in the key
  1601. foreach ($this->isoArray as $typo3Lang => $isoLang) {
  1602. $isoLang = join('-', explode('_', $isoLang));
  1603. $allLanguageCodes[$typo3Lang] = $isoLang;
  1604. }
  1605. // move the iso codes to the (because we're comparing the keys with "isset" later on)
  1606. $allLanguageCodes = array_flip($allLanguageCodes);
  1607. $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
  1608. // order the preferred languages after they key
  1609. $sortedPreferredLanguages = array();
  1610. foreach ($preferredLanguages as $preferredLanguage) {
  1611. $quality = 1.0;
  1612. if (strpos($preferredLanguage, ';q=') !== FALSE) {
  1613. list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
  1614. }
  1615. $sortedPreferredLanguages[$preferredLanguage] = $quality;
  1616. }
  1617. // loop through the languages, with the highest priority first
  1618. arsort($sortedPreferredLanguages, SORT_NUMERIC);
  1619. foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
  1620. if (isset($allLanguageCodes[$preferredLanguage])) {
  1621. $selectedLanguage = $allLanguageCodes[$preferredLanguage];
  1622. break;
  1623. }
  1624. // strip the country code from the end
  1625. list($preferredLanguage, $preferredCountry) = explode('-', $preferredLanguage);
  1626. if (isset($allLanguageCodes[$preferredLanguage])) {
  1627. $selectedLanguage = $allLanguageCodes[$preferredLanguage];
  1628. break;
  1629. }
  1630. }
  1631. if (!$selectedLanguage || $selectedLanguage == 'en') {
  1632. $selectedLanguage = 'default';
  1633. }
  1634. return $selectedLanguage;
  1635. }
  1636. /********************************************
  1637. *
  1638. * Internal string operation functions
  1639. *
  1640. ********************************************/
  1641. /**
  1642. * Maps all characters of a string in a single byte charset.
  1643. *
  1644. * @param string the string
  1645. * @param string the charset
  1646. * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
  1647. * @param string 'case': conversion 'toLower' or 'toUpper'
  1648. * @return string the converted string
  1649. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1650. */
  1651. function sb_char_mapping($str, $charset, $mode, $opt = '') {
  1652. switch ($mode) {
  1653. case 'case':
  1654. if (!$this->initCaseFolding($charset)) {
  1655. return $str;
  1656. } // do nothing
  1657. $map =& $this->caseFolding[$charset][$opt];
  1658. break;
  1659. case 'ascii':
  1660. if (!$this->initToASCII($charset)) {
  1661. return $str;
  1662. } // do nothing
  1663. $map =& $this->toASCII[$charset];
  1664. break;
  1665. default:
  1666. return $str;
  1667. }
  1668. $out = '';
  1669. for ($i = 0; strlen($str{$i}); $i++) {
  1670. $c = $str{$i};
  1671. if (isset($map[$c])) {
  1672. $out .= $map[$c];
  1673. } else {
  1674. $out .= $c;
  1675. }
  1676. }
  1677. return $out;
  1678. }
  1679. /********************************************
  1680. *
  1681. * Internal UTF-8 string operation functions
  1682. *
  1683. ********************************************/
  1684. /**
  1685. * Returns a part of a UTF-8 string.
  1686. * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
  1687. *
  1688. * @param string UTF-8 string
  1689. * @param integer Start position (character position)
  1690. * @param integer Length (in characters)
  1691. * @return string The substring
  1692. * @see substr()
  1693. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1694. */
  1695. function utf8_substr($str, $start, $len = NULL) {
  1696. if (!strcmp($len, '0')) {
  1697. return '';
  1698. }
  1699. $byte_start = $this->utf8_char2byte_pos($str, $start);
  1700. if ($byte_start === FALSE) {
  1701. if ($start > 0) {
  1702. return FALSE; // $start outside string length
  1703. } else {
  1704. $start = 0;
  1705. }
  1706. }
  1707. $str = substr($str, $byte_start);
  1708. if ($len != NULL) {
  1709. $byte_end = $this->utf8_char2byte_pos($str, $len);
  1710. if ($byte_end === FALSE) // $len outside actual string length
  1711. {
  1712. return $len < 0 ? '' : $str;
  1713. } // When length is less than zero and exceeds, then we return blank string.
  1714. else
  1715. {
  1716. return substr($str, 0, $byte_end);
  1717. }
  1718. }
  1719. else {
  1720. return $str;
  1721. }
  1722. }
  1723. /**
  1724. * Counts the number of characters of a string in UTF-8.
  1725. * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
  1726. *
  1727. * @param string UTF-8 multibyte character string
  1728. * @return integer The number of characters
  1729. * @see strlen()
  1730. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1731. */
  1732. function utf8_strlen($str) {
  1733. $n = 0;
  1734. for ($i = 0; strlen($str{$i}); $i++) {
  1735. $c = ord($str{$i});
  1736. if (!($c & 0x80)) // single-byte (0xxxxxx)
  1737. {
  1738. $n++;
  1739. }
  1740. elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
  1741. {
  1742. $n++;
  1743. }
  1744. }
  1745. return $n;
  1746. }
  1747. /**
  1748. * Truncates a string in UTF-8 short at a given byte length.
  1749. *
  1750. * @param string UTF-8 multibyte character string
  1751. * @param integer the byte length
  1752. * @return string the shortened string
  1753. * @see mb_strcut()
  1754. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1755. */
  1756. function utf8_strtrunc($str, $len) {
  1757. $i = $len - 1;
  1758. if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
  1759. for (; $i > 0 && !(ord($str{$i}) & 0x40); $i--) ; // find the first byte
  1760. if ($i <= 0) {
  1761. return '';
  1762. } // sanity check
  1763. for ($bc = 0, $mbs = ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1) $bc++; // calculate number of bytes
  1764. if ($bc + $i > $len) {
  1765. return substr($str, 0, $i);
  1766. }
  1767. // fallthru: multibyte char fits into length
  1768. }
  1769. return substr($str, 0, $len);
  1770. }
  1771. /**
  1772. * Find position of first occurrence of a string, both arguments are in UTF-8.
  1773. *
  1774. * @param string UTF-8 string to search in
  1775. * @param string UTF-8 string to search for
  1776. * @param integer Positition to start the search
  1777. * @return integer The character position
  1778. * @see strpos()
  1779. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1780. */
  1781. function utf8_strpos($haystack, $needle, $offset = 0) {
  1782. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1783. return mb_strpos($haystack, $needle, $offset, 'utf-8');
  1784. } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
  1785. return iconv_strpos($haystack, $needle, $offset, 'utf-8');
  1786. }
  1787. $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
  1788. if ($byte_offset === FALSE) {
  1789. return FALSE;
  1790. } // offset beyond string length
  1791. $byte_pos = strpos($haystack, $needle, $byte_offset);
  1792. if ($byte_pos === FALSE) {
  1793. return FALSE;
  1794. } // needle not found
  1795. return $this->utf8_byte2char_pos($haystack, $byte_pos);
  1796. }
  1797. /**
  1798. * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
  1799. *
  1800. * @param string UTF-8 string to search in
  1801. * @param string UTF-8 character to search for (single character)
  1802. * @return integer The character position
  1803. * @see strrpos()
  1804. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1805. */
  1806. function utf8_strrpos($haystack, $needle) {
  1807. if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
  1808. return mb_strrpos($haystack, $needle, 'utf-8');
  1809. } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
  1810. return iconv_strrpos($haystack, $needle, 'utf-8');
  1811. }
  1812. $byte_pos = strrpos($haystack, $needle);
  1813. if ($byte_pos === FALSE) {
  1814. return FALSE;
  1815. } // needle not found
  1816. return $this->utf8_byte2char_pos($haystack, $byte_pos);
  1817. }
  1818. /**
  1819. * Translates a character position into an 'absolute' byte position.
  1820. * Unit tested by Kasper.
  1821. *
  1822. * @param string UTF-8 string
  1823. * @param integer Character position (negative values start from the end)
  1824. * @return integer Byte position
  1825. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1826. */
  1827. function utf8_char2byte_pos($str, $pos) {
  1828. $n = 0; // number of characters found
  1829. $p = abs($pos); // number of characters wanted
  1830. if ($pos >= 0) {
  1831. $i = 0;
  1832. $d = 1;
  1833. } else {
  1834. $i = strlen($str) - 1;
  1835. $d = -1;
  1836. }
  1837. for (; strlen($str{$i}) && $n < $p; $i += $d) {
  1838. $c = (int) ord($str{$i});
  1839. if (!($c & 0x80)) // single-byte (0xxxxxx)
  1840. {
  1841. $n++;
  1842. }
  1843. elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
  1844. {
  1845. $n++;
  1846. }
  1847. }
  1848. if (!strlen($str{$i})) {
  1849. return FALSE;
  1850. } // offset beyond string length
  1851. if ($pos >= 0) {
  1852. // skip trailing multi-byte data bytes
  1853. while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) {
  1854. $i++;
  1855. }
  1856. } else {
  1857. // correct offset
  1858. $i++;
  1859. }
  1860. return $i;
  1861. }
  1862. /**
  1863. * Translates an 'absolute' byte position into a character position.
  1864. * Unit tested by Kasper.
  1865. *
  1866. * @param string UTF-8 string
  1867. * @param integer byte position
  1868. * @return integer character position
  1869. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1870. */
  1871. function utf8_byte2char_pos($str, $pos) {
  1872. $n = 0; // number of characters
  1873. for ($i = $pos; $i > 0; $i--) {
  1874. $c = (int) ord($str{$i});
  1875. if (!($c & 0x80)) // single-byte (0xxxxxx)
  1876. {
  1877. $n++;
  1878. }
  1879. elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
  1880. {
  1881. $n++;
  1882. }
  1883. }
  1884. if (!strlen($str{$i})) {
  1885. return FALSE;
  1886. } // offset beyond string length
  1887. return $n;
  1888. }
  1889. /**
  1890. * Maps all characters of an UTF-8 string.
  1891. *
  1892. * @param string UTF-8 string
  1893. * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
  1894. * @param string 'case': conversion 'toLower' or 'toUpper'
  1895. * @return string the converted string
  1896. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1897. */
  1898. function utf8_char_mapping($str, $mode, $opt = '') {
  1899. if (!$this->initUnicodeData($mode)) {
  1900. return $str;
  1901. } // do nothing
  1902. $out = '';
  1903. switch ($mode) {
  1904. case 'case':
  1905. $map =& $this->caseFolding['utf-8'][$opt];
  1906. break;
  1907. case 'ascii':
  1908. $map =& $this->toASCII['utf-8'];
  1909. break;
  1910. default:
  1911. return $str;
  1912. }
  1913. for ($i = 0; strlen($str{$i}); $i++) {
  1914. $c = ord($str{$i});
  1915. if (!($c & 0x80)) // single-byte (0xxxxxx)
  1916. {
  1917. $mbc = $str{$i};
  1918. }
  1919. elseif (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
  1920. for ($bc = 0; $c & 0x80; $c = $c << 1) {
  1921. $bc++;
  1922. } // calculate number of bytes
  1923. $mbc = substr($str, $i, $bc);
  1924. $i += $bc - 1;
  1925. }
  1926. if (isset($map[$mbc])) {
  1927. $out .= $map[$mbc];
  1928. } else {
  1929. $out .= $mbc;
  1930. }
  1931. }
  1932. return $out;
  1933. }
  1934. /********************************************
  1935. *
  1936. * Internal EUC string operation functions
  1937. *
  1938. * Extended Unix Code:
  1939. * ASCII compatible 7bit single bytes chars
  1940. * 8bit two byte chars
  1941. *
  1942. * Shift-JIS is treated as a special case.
  1943. *
  1944. ********************************************/
  1945. /**
  1946. * Cuts a string in the EUC charset family short at a given byte length.
  1947. *
  1948. * @param string EUC multibyte character string
  1949. * @param integer the byte length
  1950. * @param string the charset
  1951. * @return string the shortened string
  1952. * @see mb_strcut()
  1953. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1954. */
  1955. function euc_strtrunc($str, $len, $charset) {
  1956. $sjis = ($charset == 'shift_jis');
  1957. for ($i = 0; strlen($str{$i}) && $i < $len; $i++) {
  1958. $c = ord($str{$i});
  1959. if ($sjis) {
  1960. if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
  1961. $i++;
  1962. } // advance a double-byte char
  1963. }
  1964. else {
  1965. if ($c >= 0x80) {
  1966. $i++;
  1967. } // advance a double-byte char
  1968. }
  1969. }
  1970. if (!strlen($str{$i})) {
  1971. return $str;
  1972. } // string shorter than supplied length
  1973. if ($i > $len) {
  1974. return substr($str, 0, $len - 1); // we ended on a first byte
  1975. } else {
  1976. return substr($str, 0, $len);
  1977. }
  1978. }
  1979. /**
  1980. * Returns a part of a string in the EUC charset family.
  1981. *
  1982. * @param string EUC multibyte character string
  1983. * @param integer start position (character position)
  1984. * @param string the charset
  1985. * @param integer length (in characters)
  1986. * @return string the substring
  1987. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  1988. */
  1989. function euc_substr($str, $start, $charset, $len = NULL) {
  1990. $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
  1991. if ($byte_start === FALSE) {
  1992. return FALSE;
  1993. } // $start outside string length
  1994. $str = substr($str, $byte_start);
  1995. if ($len != NULL) {
  1996. $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
  1997. if ($byte_end === FALSE) // $len outside actual string length
  1998. {
  1999. return $str;
  2000. }
  2001. else
  2002. {
  2003. return substr($str, 0, $byte_end);
  2004. }
  2005. }
  2006. else {
  2007. return $str;
  2008. }
  2009. }
  2010. /**
  2011. * Counts the number of characters of a string in the EUC charset family.
  2012. *
  2013. * @param string EUC multibyte character string
  2014. * @param string the charset
  2015. * @return integer the number of characters
  2016. * @see strlen()
  2017. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  2018. */
  2019. function euc_strlen($str, $charset) {
  2020. $sjis = ($charset == 'shift_jis');
  2021. $n = 0;
  2022. for ($i = 0; strlen($str{$i}); $i++) {
  2023. $c = ord($str{$i});
  2024. if ($sjis) {
  2025. if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
  2026. $i++;
  2027. } // advance a double-byte char
  2028. }
  2029. else {
  2030. if ($c >= 0x80) {
  2031. $i++;
  2032. } // advance a double-byte char
  2033. }
  2034. $n++;
  2035. }
  2036. return $n;
  2037. }
  2038. /**
  2039. * Translates a character position into an 'absolute' byte position.
  2040. *
  2041. * @param string EUC multibyte character string
  2042. * @param integer character position (negative values start from the end)
  2043. * @param string the charset
  2044. * @return integer byte position
  2045. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  2046. */
  2047. function euc_char2byte_pos($str, $pos, $charset) {
  2048. $sjis = ($charset == 'shift_jis');
  2049. $n = 0; // number of characters seen
  2050. $p = abs($pos); // number of characters wanted
  2051. if ($pos >= 0) {
  2052. $i = 0;
  2053. $d = 1;
  2054. } else {
  2055. $i = strlen($str) - 1;
  2056. $d = -1;
  2057. }
  2058. for (; strlen($str{$i}) && $n < $p; $i += $d) {
  2059. $c = ord($str{$i});
  2060. if ($sjis) {
  2061. if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
  2062. $i += $d;
  2063. } // advance a double-byte char
  2064. }
  2065. else {
  2066. if ($c >= 0x80) {
  2067. $i += $d;
  2068. } // advance a double-byte char
  2069. }
  2070. $n++;
  2071. }
  2072. if (!strlen($str{$i})) {
  2073. return FALSE;
  2074. } // offset beyond string length
  2075. if ($pos < 0) {
  2076. $i++;
  2077. } // correct offset
  2078. return $i;
  2079. }
  2080. /**
  2081. * Maps all characters of a string in the EUC charset family.
  2082. *
  2083. * @param string EUC multibyte character string
  2084. * @param string the charset
  2085. * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
  2086. * @param string 'case': conversion 'toLower' or 'toUpper'
  2087. * @return string the converted string
  2088. * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
  2089. */
  2090. function euc_char_mapping($str, $charset, $mode, $opt = '') {
  2091. switch ($mode) {
  2092. case 'case':
  2093. if (!$this->initCaseFolding($charset)) {
  2094. return $str;
  2095. } // do nothing
  2096. $map =& $this->caseFolding[$charset][$opt];
  2097. break;
  2098. case 'ascii':
  2099. if (!$this->initToASCII($charset)) {
  2100. return $str;
  2101. } // do nothing
  2102. $map =& $this->toASCII[$charset];
  2103. break;
  2104. default:
  2105. return $str;
  2106. }
  2107. $sjis = ($charset == 'shift_jis');
  2108. $out = '';
  2109. for ($i = 0; strlen($str{$i}); $i++) {
  2110. $mbc = $str{$i};
  2111. $c = ord($mbc);
  2112. if ($sjis) {
  2113. if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
  2114. $mbc = substr($str, $i, 2);
  2115. $i++;
  2116. }
  2117. }
  2118. else {
  2119. if ($c >= 0x80) { // a double-byte char
  2120. $mbc = substr($str, $i, 2);
  2121. $i++;
  2122. }
  2123. }
  2124. if (isset($map[$mbc])) {
  2125. $out .= $map[$mbc];
  2126. } else {
  2127. $out .= $mbc;
  2128. }
  2129. }
  2130. return $out;
  2131. }
  2132. }
  2133. if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
  2134. include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
  2135. }
  2136. ?>