PageRenderTime 48ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 1ms

/hphp/runtime/ext/ext_mb.cpp

http://github.com/facebook/hiphop-php
C++ | 4191 lines | 3589 code | 415 blank | 187 comment | 982 complexity | 6654a83821b70924cecdcf76a157ee43 MD5 | raw file
Possible License(s): LGPL-2.1, BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, MIT, LGPL-2.0, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. /*
  2. +----------------------------------------------------------------------+
  3. | HipHop for PHP |
  4. +----------------------------------------------------------------------+
  5. | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
  6. | Copyright (c) 1997-2010 The PHP Group |
  7. +----------------------------------------------------------------------+
  8. | This source file is subject to version 3.01 of the PHP license, |
  9. | that is bundled with this package in the file LICENSE, and is |
  10. | available through the world-wide-web at the following url: |
  11. | http://www.php.net/license/3_01.txt |
  12. | If you did not receive a copy of the PHP license and are unable to |
  13. | obtain it through the world-wide-web, please send a note to |
  14. | license@php.net so we can mail you a copy immediately. |
  15. +----------------------------------------------------------------------+
  16. */
  17. #include "hphp/runtime/ext/ext_mb.h"
  18. #include "hphp/runtime/base/string-buffer.h"
  19. #include "hphp/runtime/base/request-local.h"
  20. #include "hphp/runtime/ext/php_unicode.h"
  21. #include "hphp/runtime/ext/unicode_data.h"
  22. #include "hphp/runtime/ext/ext_process.h"
  23. #include "hphp/runtime/ext/ext_string.h"
  24. #include "hphp/runtime/base/zend-url.h"
  25. #include "hphp/runtime/base/zend-string.h"
  26. #include "hphp/runtime/base/ini-setting.h"
  27. extern "C" {
  28. #include <mbfl/mbfl_convert.h>
  29. #include <mbfl/mbfilter.h>
  30. #include <oniguruma.h>
  31. }
  32. #define php_mb_re_pattern_buffer re_pattern_buffer
  33. #define php_mb_regex_t regex_t
  34. #define php_mb_re_registers re_registers
  35. extern void mbfl_memory_device_unput(mbfl_memory_device *device);
  36. #define PARSE_POST 0
  37. #define PARSE_GET 1
  38. #define PARSE_COOKIE 2
  39. #define PARSE_STRING 3
  40. #define PARSE_ENV 4
  41. #define PARSE_SERVER 5
  42. #define PARSE_SESSION 6
  43. namespace HPHP {
  44. static class mbstringExtension : public Extension {
  45. public:
  46. mbstringExtension() : Extension("mbstring") {}
  47. virtual void moduleInit() {
  48. IniSetting::SetGlobalDefault("mbstring.http_input", "pass");
  49. IniSetting::SetGlobalDefault("mbstring.http_output", "pass");
  50. }
  51. } s_mbstring_extension;
  52. ///////////////////////////////////////////////////////////////////////////////
  53. // statics
  54. #define PHP_MBSTR_STACK_BLOCK_SIZE 32
  55. typedef struct _php_mb_nls_ident_list {
  56. mbfl_no_language lang;
  57. mbfl_no_encoding* list;
  58. int list_size;
  59. } php_mb_nls_ident_list;
  60. static mbfl_no_encoding php_mb_default_identify_list_ja[] = {
  61. mbfl_no_encoding_ascii,
  62. mbfl_no_encoding_jis,
  63. mbfl_no_encoding_utf8,
  64. mbfl_no_encoding_euc_jp,
  65. mbfl_no_encoding_sjis
  66. };
  67. static mbfl_no_encoding php_mb_default_identify_list_cn[] = {
  68. mbfl_no_encoding_ascii,
  69. mbfl_no_encoding_utf8,
  70. mbfl_no_encoding_euc_cn,
  71. mbfl_no_encoding_cp936
  72. };
  73. static mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
  74. mbfl_no_encoding_ascii,
  75. mbfl_no_encoding_utf8,
  76. mbfl_no_encoding_euc_tw,
  77. mbfl_no_encoding_big5
  78. };
  79. static mbfl_no_encoding php_mb_default_identify_list_kr[] = {
  80. mbfl_no_encoding_ascii,
  81. mbfl_no_encoding_utf8,
  82. mbfl_no_encoding_euc_kr,
  83. mbfl_no_encoding_uhc
  84. };
  85. static mbfl_no_encoding php_mb_default_identify_list_ru[] = {
  86. mbfl_no_encoding_ascii,
  87. mbfl_no_encoding_utf8,
  88. mbfl_no_encoding_koi8r,
  89. mbfl_no_encoding_cp1251,
  90. mbfl_no_encoding_cp866
  91. };
  92. static mbfl_no_encoding php_mb_default_identify_list_hy[] = {
  93. mbfl_no_encoding_ascii,
  94. mbfl_no_encoding_utf8,
  95. mbfl_no_encoding_armscii8
  96. };
  97. static mbfl_no_encoding php_mb_default_identify_list_tr[] = {
  98. mbfl_no_encoding_ascii,
  99. mbfl_no_encoding_utf8,
  100. mbfl_no_encoding_8859_9
  101. };
  102. static mbfl_no_encoding php_mb_default_identify_list_neut[] = {
  103. mbfl_no_encoding_ascii,
  104. mbfl_no_encoding_utf8
  105. };
  106. static php_mb_nls_ident_list php_mb_default_identify_list[] = {
  107. { mbfl_no_language_japanese, php_mb_default_identify_list_ja,
  108. sizeof(php_mb_default_identify_list_ja) /
  109. sizeof(php_mb_default_identify_list_ja[0]) },
  110. { mbfl_no_language_korean, php_mb_default_identify_list_kr,
  111. sizeof(php_mb_default_identify_list_kr) /
  112. sizeof(php_mb_default_identify_list_kr[0]) },
  113. { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk,
  114. sizeof(php_mb_default_identify_list_tw_hk) /
  115. sizeof(php_mb_default_identify_list_tw_hk[0]) },
  116. { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn,
  117. sizeof(php_mb_default_identify_list_cn) /
  118. sizeof(php_mb_default_identify_list_cn[0]) },
  119. { mbfl_no_language_russian, php_mb_default_identify_list_ru,
  120. sizeof(php_mb_default_identify_list_ru) /
  121. sizeof(php_mb_default_identify_list_ru[0]) },
  122. { mbfl_no_language_armenian, php_mb_default_identify_list_hy,
  123. sizeof(php_mb_default_identify_list_hy) /
  124. sizeof(php_mb_default_identify_list_hy[0]) },
  125. { mbfl_no_language_turkish, php_mb_default_identify_list_tr,
  126. sizeof(php_mb_default_identify_list_tr) /
  127. sizeof(php_mb_default_identify_list_tr[0]) },
  128. { mbfl_no_language_neutral, php_mb_default_identify_list_neut,
  129. sizeof(php_mb_default_identify_list_neut) /
  130. sizeof(php_mb_default_identify_list_neut[0]) }
  131. };
  132. ///////////////////////////////////////////////////////////////////////////////
  133. // globals
  134. typedef std::map<std::string, php_mb_regex_t *> RegexCache;
  135. class MBGlobals : public RequestEventHandler {
  136. public:
  137. mbfl_no_language language;
  138. mbfl_no_language current_language;
  139. mbfl_no_encoding internal_encoding;
  140. mbfl_no_encoding current_internal_encoding;
  141. mbfl_no_encoding http_output_encoding;
  142. mbfl_no_encoding current_http_output_encoding;
  143. mbfl_no_encoding http_input_identify;
  144. mbfl_no_encoding http_input_identify_get;
  145. mbfl_no_encoding http_input_identify_post;
  146. mbfl_no_encoding http_input_identify_cookie;
  147. mbfl_no_encoding http_input_identify_string;
  148. mbfl_no_encoding *http_input_list;
  149. int http_input_list_size;
  150. mbfl_no_encoding *detect_order_list;
  151. int detect_order_list_size;
  152. mbfl_no_encoding *current_detect_order_list;
  153. int current_detect_order_list_size;
  154. mbfl_no_encoding *default_detect_order_list;
  155. int default_detect_order_list_size;
  156. int filter_illegal_mode;
  157. int filter_illegal_substchar;
  158. int current_filter_illegal_mode;
  159. int current_filter_illegal_substchar;
  160. bool encoding_translation;
  161. long strict_detection;
  162. long illegalchars;
  163. mbfl_buffer_converter *outconv;
  164. OnigEncoding default_mbctype;
  165. OnigEncoding current_mbctype;
  166. RegexCache ht_rc;
  167. std::string search_str;
  168. unsigned int search_pos;
  169. php_mb_regex_t *search_re;
  170. OnigRegion *search_regs;
  171. OnigOptionType regex_default_options;
  172. OnigSyntaxType *regex_default_syntax;
  173. MBGlobals() :
  174. language(mbfl_no_language_uni),
  175. current_language(mbfl_no_language_uni),
  176. internal_encoding(mbfl_no_encoding_utf8),
  177. current_internal_encoding(mbfl_no_encoding_utf8),
  178. http_output_encoding(mbfl_no_encoding_pass),
  179. current_http_output_encoding(mbfl_no_encoding_pass),
  180. http_input_identify(mbfl_no_encoding_invalid),
  181. http_input_identify_get(mbfl_no_encoding_invalid),
  182. http_input_identify_post(mbfl_no_encoding_invalid),
  183. http_input_identify_cookie(mbfl_no_encoding_invalid),
  184. http_input_identify_string(mbfl_no_encoding_invalid),
  185. http_input_list(NULL),
  186. http_input_list_size(0),
  187. detect_order_list(NULL),
  188. detect_order_list_size(0),
  189. current_detect_order_list(NULL),
  190. current_detect_order_list_size(0),
  191. default_detect_order_list
  192. ((mbfl_no_encoding *)php_mb_default_identify_list_neut),
  193. default_detect_order_list_size
  194. (sizeof(php_mb_default_identify_list_neut) /
  195. sizeof(php_mb_default_identify_list_neut[0])),
  196. filter_illegal_mode(MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR),
  197. filter_illegal_substchar(0x3f), /* '?' */
  198. current_filter_illegal_mode(MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR),
  199. current_filter_illegal_substchar(0x3f), /* '?' */
  200. encoding_translation(0),
  201. strict_detection(0),
  202. illegalchars(0),
  203. outconv(NULL),
  204. default_mbctype(ONIG_ENCODING_EUC_JP),
  205. current_mbctype(ONIG_ENCODING_EUC_JP),
  206. search_pos(0),
  207. search_re((php_mb_regex_t*)NULL),
  208. search_regs((OnigRegion*)NULL),
  209. regex_default_options(ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE),
  210. regex_default_syntax(ONIG_SYNTAX_RUBY) {
  211. }
  212. virtual void requestInit() {
  213. current_language = language;
  214. current_internal_encoding = internal_encoding;
  215. current_http_output_encoding = http_output_encoding;
  216. current_filter_illegal_mode = filter_illegal_mode;
  217. current_filter_illegal_substchar = filter_illegal_substchar;
  218. if (!encoding_translation) {
  219. illegalchars = 0;
  220. }
  221. mbfl_no_encoding *list=NULL, *entry;
  222. int n = 0;
  223. if (detect_order_list) {
  224. list = detect_order_list;
  225. n = detect_order_list_size;
  226. }
  227. if (n <= 0) {
  228. list = default_detect_order_list;
  229. n = default_detect_order_list_size;
  230. }
  231. entry = (mbfl_no_encoding *)malloc(n * sizeof(int));
  232. current_detect_order_list = entry;
  233. current_detect_order_list_size = n;
  234. while (n > 0) {
  235. *entry++ = *list++;
  236. n--;
  237. }
  238. }
  239. virtual void requestShutdown() {
  240. if (current_detect_order_list != NULL) {
  241. free(current_detect_order_list);
  242. current_detect_order_list = NULL;
  243. current_detect_order_list_size = 0;
  244. }
  245. if (outconv != NULL) {
  246. illegalchars += mbfl_buffer_illegalchars(outconv);
  247. mbfl_buffer_converter_delete(outconv);
  248. outconv = NULL;
  249. }
  250. /* clear http input identification. */
  251. http_input_identify = mbfl_no_encoding_invalid;
  252. http_input_identify_post = mbfl_no_encoding_invalid;
  253. http_input_identify_get = mbfl_no_encoding_invalid;
  254. http_input_identify_cookie = mbfl_no_encoding_invalid;
  255. http_input_identify_string = mbfl_no_encoding_invalid;
  256. current_mbctype = default_mbctype;
  257. search_str.clear();
  258. search_pos = 0;
  259. if (search_regs != NULL) {
  260. onig_region_free(search_regs, 1);
  261. search_regs = (OnigRegion *)NULL;
  262. }
  263. for (RegexCache::const_iterator it = ht_rc.begin(); it != ht_rc.end();
  264. ++it) {
  265. onig_free(it->second);
  266. }
  267. ht_rc.clear();
  268. }
  269. };
  270. IMPLEMENT_STATIC_REQUEST_LOCAL(MBGlobals, s_mb_globals);
  271. #define MBSTRG(name) s_mb_globals->name
  272. ///////////////////////////////////////////////////////////////////////////////
  273. // unicode functions
  /*
   * Single-bit lookup table: masks32[i] has only bit i set (i = 0..31).
   */
  static unsigned long masks32[32] = {
    1UL <<  0, 1UL <<  1, 1UL <<  2, 1UL <<  3,
    1UL <<  4, 1UL <<  5, 1UL <<  6, 1UL <<  7,
    1UL <<  8, 1UL <<  9, 1UL << 10, 1UL << 11,
    1UL << 12, 1UL << 13, 1UL << 14, 1UL << 15,
    1UL << 16, 1UL << 17, 1UL << 18, 1UL << 19,
    1UL << 20, 1UL << 21, 1UL << 22, 1UL << 23,
    1UL << 24, 1UL << 25, 1UL << 26, 1UL << 27,
    1UL << 28, 1UL << 29, 1UL << 30, 1UL << 31
  };
  285. static int prop_lookup(unsigned long code, unsigned long n) {
  286. long l, r, m;
  287. /*
  288. * There is an extra node on the end of the offsets to allow this routine
  289. * to work right. If the index is 0xffff, then there are no nodes for the
  290. * property.
  291. */
  292. if ((l = _ucprop_offsets[n]) == 0xffff)
  293. return 0;
  294. /*
  295. * Locate the next offset that is not 0xffff. The sentinel at the end of
  296. * the array is the max index value.
  297. */
  298. for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
  299. ;
  300. r = _ucprop_offsets[n + m] - 1;
  301. while (l <= r) {
  302. /*
  303. * Determine a "mid" point and adjust to make sure the mid point is at
  304. * the beginning of a range pair.
  305. */
  306. m = (l + r) >> 1;
  307. m -= (m & 1);
  308. if (code > _ucprop_ranges[m + 1])
  309. l = m + 2;
  310. else if (code < _ucprop_ranges[m])
  311. r = m - 2;
  312. else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
  313. return 1;
  314. }
  315. return 0;
  316. }
  317. static int php_unicode_is_prop(unsigned long code, unsigned long mask1,
  318. unsigned long mask2) {
  319. unsigned long i;
  320. if (mask1 == 0 && mask2 == 0)
  321. return 0;
  322. for (i = 0; mask1 && i < 32; i++) {
  323. if ((mask1 & masks32[i]) && prop_lookup(code, i))
  324. return 1;
  325. }
  326. for (i = 32; mask2 && i < _ucprop_size; i++) {
  327. if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
  328. return 1;
  329. }
  330. return 0;
  331. }
  332. static unsigned long case_lookup(unsigned long code, long l, long r,
  333. int field) {
  334. long m;
  335. /*
  336. * Do the binary search.
  337. */
  338. while (l <= r) {
  339. /*
  340. * Determine a "mid" point and adjust to make sure the mid point is at
  341. * the beginning of a case mapping triple.
  342. */
  343. m = (l + r) >> 1;
  344. m -= (m % 3);
  345. if (code > _uccase_map[m])
  346. l = m + 3;
  347. else if (code < _uccase_map[m])
  348. r = m - 3;
  349. else if (code == _uccase_map[m])
  350. return _uccase_map[m + field];
  351. }
  352. return code;
  353. }
  354. static unsigned long php_turkish_toupper(unsigned long code, long l, long r,
  355. int field) {
  356. if (code == 0x0069L) {
  357. return 0x0130L;
  358. }
  359. return case_lookup(code, l, r, field);
  360. }
  361. static unsigned long php_turkish_tolower(unsigned long code, long l, long r,
  362. int field) {
  363. if (code == 0x0049L) {
  364. return 0x0131L;
  365. }
  366. return case_lookup(code, l, r, field);
  367. }
  368. static unsigned long php_unicode_toupper(unsigned long code,
  369. enum mbfl_no_encoding enc) {
  370. int field;
  371. long l, r;
  372. if (php_unicode_is_upper(code))
  373. return code;
  374. if (php_unicode_is_lower(code)) {
  375. /*
  376. * The character is lower case.
  377. */
  378. field = 2;
  379. l = _uccase_len[0];
  380. r = (l + _uccase_len[1]) - 3;
  381. if (enc == mbfl_no_encoding_8859_9) {
  382. return php_turkish_toupper(code, l, r, field);
  383. }
  384. } else {
  385. /*
  386. * The character is title case.
  387. */
  388. field = 1;
  389. l = _uccase_len[0] + _uccase_len[1];
  390. r = _uccase_size - 3;
  391. }
  392. return case_lookup(code, l, r, field);
  393. }
  394. static unsigned long php_unicode_tolower(unsigned long code,
  395. enum mbfl_no_encoding enc) {
  396. int field;
  397. long l, r;
  398. if (php_unicode_is_lower(code))
  399. return code;
  400. if (php_unicode_is_upper(code)) {
  401. /*
  402. * The character is upper case.
  403. */
  404. field = 1;
  405. l = 0;
  406. r = _uccase_len[0] - 3;
  407. if (enc == mbfl_no_encoding_8859_9) {
  408. return php_turkish_tolower(code, l, r, field);
  409. }
  410. } else {
  411. /*
  412. * The character is title case.
  413. */
  414. field = 2;
  415. l = _uccase_len[0] + _uccase_len[1];
  416. r = _uccase_size - 3;
  417. }
  418. return case_lookup(code, l, r, field);
  419. }
  420. static unsigned long php_unicode_totitle(unsigned long code,
  421. enum mbfl_no_encoding enc) {
  422. int field;
  423. long l, r;
  424. if (php_unicode_is_title(code))
  425. return code;
  426. /*
  427. * The offset will always be the same for converting to title case.
  428. */
  429. field = 2;
  430. if (php_unicode_is_upper(code)) {
  431. /*
  432. * The character is upper case.
  433. */
  434. l = 0;
  435. r = _uccase_len[0] - 3;
  436. } else {
  437. /*
  438. * The character is lower case.
  439. */
  440. l = _uccase_len[0];
  441. r = (l + _uccase_len[1]) - 3;
  442. }
  443. return case_lookup(code, l, r, field);
  444. }
  /**
   * Read four bytes at `ptr` as a big-endian unsigned 32-bit value.
   *
   * Each byte is widened to unsigned int before shifting. Shifting the
   * int-promoted byte would give a negative int for a lead byte >= 0x80,
   * which then sign-extends when the result is widened to unsigned long.
   */
  #define BE_ARY_TO_UINT32(ptr) (\
    ((unsigned int)((const unsigned char*)(ptr))[0] << 24) |\
    ((unsigned int)((const unsigned char*)(ptr))[1] << 16) |\
    ((unsigned int)((const unsigned char*)(ptr))[2] <<  8) |\
    ((unsigned int)((const unsigned char*)(ptr))[3]      ) )

  /**
   * Store the low 32 bits of `val` at `ptr` as four big-endian bytes.
   *
   * `val` is evaluated once, before any byte is written, so it may read from
   * the same four bytes being overwritten. The do/while(0) wrapper makes the
   * macro a single statement, safe in an unbraced if/else.
   */
  #define UINT32_TO_BE_ARY(ptr,val) do { \
    const unsigned int be_value_ = (unsigned int)(val); \
    unsigned char *const be_bytes_ = (unsigned char*)(ptr); \
    be_bytes_[0] = (unsigned char)((be_value_ >> 24) & 0xff); \
    be_bytes_[1] = (unsigned char)((be_value_ >> 16) & 0xff); \
    be_bytes_[2] = (unsigned char)((be_value_ >>  8) & 0xff); \
    be_bytes_[3] = (unsigned char)((be_value_      ) & 0xff); \
  } while (0)
  457. /**
  458. * Return 0 if input contains any illegal encoding, otherwise 1.
  459. * Even if any illegal encoding is detected the result may contain a list
  460. * of parsed encodings.
  461. */
  462. static int php_mb_parse_encoding_list(const char *value, int value_length,
  463. mbfl_no_encoding **return_list,
  464. int *return_size, int persistent) {
  465. int n, l, size, bauto, ret = 1;
  466. char *p, *p1, *p2, *endp, *tmpstr;
  467. mbfl_no_encoding no_encoding;
  468. mbfl_no_encoding *src, *entry, *list;
  469. list = NULL;
  470. if (value == NULL || value_length <= 0) {
  471. if (return_list) {
  472. *return_list = NULL;
  473. }
  474. if (return_size) {
  475. *return_size = 0;
  476. }
  477. return 0;
  478. } else {
  479. mbfl_no_encoding *identify_list;
  480. int identify_list_size;
  481. identify_list = MBSTRG(default_detect_order_list);
  482. identify_list_size = MBSTRG(default_detect_order_list_size);
  483. /* copy the value string for work */
  484. if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
  485. tmpstr = (char *)strndup(value+1, value_length-2);
  486. value_length -= 2;
  487. }
  488. else
  489. tmpstr = (char *)strndup(value, value_length);
  490. if (tmpstr == NULL) {
  491. return 0;
  492. }
  493. /* count the number of listed encoding names */
  494. endp = tmpstr + value_length;
  495. n = 1;
  496. p1 = tmpstr;
  497. while ((p2 = (char*)string_memnstr(p1, ",", 1, endp)) != NULL) {
  498. p1 = p2 + 1;
  499. n++;
  500. }
  501. size = n + identify_list_size;
  502. /* make list */
  503. list = (mbfl_no_encoding *)calloc(size, sizeof(int));
  504. if (list != NULL) {
  505. entry = list;
  506. n = 0;
  507. bauto = 0;
  508. p1 = tmpstr;
  509. do {
  510. p2 = p = (char*)string_memnstr(p1, ",", 1, endp);
  511. if (p == NULL) {
  512. p = endp;
  513. }
  514. *p = '\0';
  515. /* trim spaces */
  516. while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
  517. p1++;
  518. }
  519. p--;
  520. while (p > p1 && (*p == ' ' || *p == '\t')) {
  521. *p = '\0';
  522. p--;
  523. }
  524. /* convert to the encoding number and check encoding */
  525. if (strcasecmp(p1, "auto") == 0) {
  526. if (!bauto) {
  527. bauto = 1;
  528. l = identify_list_size;
  529. src = identify_list;
  530. while (l > 0) {
  531. *entry++ = *src++;
  532. l--;
  533. n++;
  534. }
  535. }
  536. } else {
  537. no_encoding = mbfl_name2no_encoding(p1);
  538. if (no_encoding != mbfl_no_encoding_invalid) {
  539. *entry++ = no_encoding;
  540. n++;
  541. } else {
  542. ret = 0;
  543. }
  544. }
  545. p1 = p2 + 1;
  546. } while (n < size && p2 != NULL);
  547. if (n > 0) {
  548. if (return_list) {
  549. *return_list = list;
  550. } else {
  551. free(list);
  552. }
  553. } else {
  554. free(list);
  555. if (return_list) {
  556. *return_list = NULL;
  557. }
  558. ret = 0;
  559. }
  560. if (return_size) {
  561. *return_size = n;
  562. }
  563. } else {
  564. if (return_list) {
  565. *return_list = NULL;
  566. }
  567. if (return_size) {
  568. *return_size = 0;
  569. }
  570. ret = 0;
  571. }
  572. free(tmpstr);
  573. }
  574. return ret;
  575. }
  576. static char *php_mb_convert_encoding(const char *input, size_t length,
  577. const char *_to_encoding,
  578. const char *_from_encodings,
  579. unsigned int *output_len) {
  580. mbfl_string string, result, *ret;
  581. mbfl_no_encoding from_encoding, to_encoding;
  582. mbfl_buffer_converter *convd;
  583. int size;
  584. mbfl_no_encoding *list;
  585. char *output = NULL;
  586. if (output_len) {
  587. *output_len = 0;
  588. }
  589. if (!input) {
  590. return NULL;
  591. }
  592. /* new encoding */
  593. if (_to_encoding && strlen(_to_encoding)) {
  594. to_encoding = mbfl_name2no_encoding(_to_encoding);
  595. if (to_encoding == mbfl_no_encoding_invalid) {
  596. raise_warning("Unknown encoding \"%s\"", _to_encoding);
  597. return NULL;
  598. }
  599. } else {
  600. to_encoding = MBSTRG(current_internal_encoding);
  601. }
  602. /* initialize string */
  603. mbfl_string_init(&string);
  604. mbfl_string_init(&result);
  605. from_encoding = MBSTRG(current_internal_encoding);
  606. string.no_encoding = from_encoding;
  607. string.no_language = MBSTRG(current_language);
  608. string.val = (unsigned char *)input;
  609. string.len = length;
  610. /* pre-conversion encoding */
  611. if (_from_encodings) {
  612. list = NULL;
  613. size = 0;
  614. php_mb_parse_encoding_list(_from_encodings, strlen(_from_encodings),
  615. &list, &size, 0);
  616. if (size == 1) {
  617. from_encoding = *list;
  618. string.no_encoding = from_encoding;
  619. } else if (size > 1) {
  620. /* auto detect */
  621. from_encoding = mbfl_identify_encoding_no(&string, list, size,
  622. MBSTRG(strict_detection));
  623. if (from_encoding != mbfl_no_encoding_invalid) {
  624. string.no_encoding = from_encoding;
  625. } else {
  626. raise_warning("Unable to detect character encoding");
  627. from_encoding = mbfl_no_encoding_pass;
  628. to_encoding = from_encoding;
  629. string.no_encoding = from_encoding;
  630. }
  631. } else {
  632. raise_warning("Illegal character encoding specified");
  633. }
  634. if (list != NULL) {
  635. free((void *)list);
  636. }
  637. }
  638. /* initialize converter */
  639. convd = mbfl_buffer_converter_new(from_encoding, to_encoding, string.len);
  640. if (convd == NULL) {
  641. raise_warning("Unable to create character encoding converter");
  642. return NULL;
  643. }
  644. mbfl_buffer_converter_illegal_mode
  645. (convd, MBSTRG(current_filter_illegal_mode));
  646. mbfl_buffer_converter_illegal_substchar
  647. (convd, MBSTRG(current_filter_illegal_substchar));
  648. /* do it */
  649. ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
  650. if (ret) {
  651. if (output_len) {
  652. *output_len = ret->len;
  653. }
  654. output = (char *)ret->val;
  655. }
  656. MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
  657. mbfl_buffer_converter_delete(convd);
  658. return output;
  659. }
  660. static char *php_unicode_convert_case(int case_mode, const char *srcstr,
  661. size_t srclen, unsigned int *ret_len,
  662. const char *src_encoding) {
  663. char *unicode, *newstr;
  664. unsigned int unicode_len;
  665. unsigned char *unicode_ptr;
  666. size_t i;
  667. enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
  668. unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding,
  669. &unicode_len);
  670. if (unicode == NULL)
  671. return NULL;
  672. unicode_ptr = (unsigned char *)unicode;
  673. switch(case_mode) {
  674. case PHP_UNICODE_CASE_UPPER:
  675. for (i = 0; i < unicode_len; i+=4) {
  676. UINT32_TO_BE_ARY(&unicode_ptr[i],
  677. php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]),
  678. _src_encoding));
  679. }
  680. break;
  681. case PHP_UNICODE_CASE_LOWER:
  682. for (i = 0; i < unicode_len; i+=4) {
  683. UINT32_TO_BE_ARY(&unicode_ptr[i],
  684. php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]),
  685. _src_encoding));
  686. }
  687. break;
  688. case PHP_UNICODE_CASE_TITLE:
  689. {
  690. int mode = 0;
  691. for (i = 0; i < unicode_len; i+=4) {
  692. int res = php_unicode_is_prop
  693. (BE_ARY_TO_UINT32(&unicode_ptr[i]),
  694. UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT, 0);
  695. if (mode) {
  696. if (res) {
  697. UINT32_TO_BE_ARY
  698. (&unicode_ptr[i],
  699. php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]),
  700. _src_encoding));
  701. } else {
  702. mode = 0;
  703. }
  704. } else {
  705. if (res) {
  706. mode = 1;
  707. UINT32_TO_BE_ARY
  708. (&unicode_ptr[i],
  709. php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]),
  710. _src_encoding));
  711. }
  712. }
  713. }
  714. }
  715. break;
  716. }
  717. newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding,
  718. "UCS-4BE", ret_len);
  719. free(unicode);
  720. return newstr;
  721. }
  722. ///////////////////////////////////////////////////////////////////////////////
  723. // helpers
  724. /**
  725. * Return 0 if input contains any illegal encoding, otherwise 1.
  726. * Even if any illegal encoding is detected the result may contain a list
  727. * of parsed encodings.
  728. */
  729. static int php_mb_parse_encoding_array(CArrRef array,
  730. mbfl_no_encoding **return_list,
  731. int *return_size, int persistent) {
  732. int n, l, size, bauto,ret = 1;
  733. mbfl_no_encoding no_encoding;
  734. mbfl_no_encoding *src, *list, *entry;
  735. list = NULL;
  736. mbfl_no_encoding *identify_list = MBSTRG(default_detect_order_list);
  737. int identify_list_size = MBSTRG(default_detect_order_list_size);
  738. size = array.size() + identify_list_size;
  739. list = (mbfl_no_encoding *)calloc(size, sizeof(int));
  740. if (list != NULL) {
  741. entry = list;
  742. bauto = 0;
  743. n = 0;
  744. for (ArrayIter iter(array); iter; ++iter) {
  745. String hash_entry = iter.second();
  746. if (strcasecmp(hash_entry.data(), "auto") == 0) {
  747. if (!bauto) {
  748. bauto = 1;
  749. l = identify_list_size;
  750. src = identify_list;
  751. while (l > 0) {
  752. *entry++ = *src++;
  753. l--;
  754. n++;
  755. }
  756. }
  757. } else {
  758. no_encoding = mbfl_name2no_encoding(hash_entry.data());
  759. if (no_encoding != mbfl_no_encoding_invalid) {
  760. *entry++ = no_encoding;
  761. n++;
  762. } else {
  763. ret = 0;
  764. }
  765. }
  766. }
  767. if (n > 0) {
  768. if (return_list) {
  769. *return_list = list;
  770. } else {
  771. free(list);
  772. }
  773. } else {
  774. free(list);
  775. if (return_list) {
  776. *return_list = NULL;
  777. }
  778. ret = 0;
  779. }
  780. if (return_size) {
  781. *return_size = n;
  782. }
  783. } else {
  784. if (return_list) {
  785. *return_list = NULL;
  786. }
  787. if (return_size) {
  788. *return_size = 0;
  789. }
  790. ret = 0;
  791. }
  792. return ret;
  793. }
  794. static bool php_mb_parse_encoding(CVarRef encoding,
  795. mbfl_no_encoding **return_list,
  796. int *return_size, bool persistent) {
  797. bool ret;
  798. if (encoding.is(KindOfArray)) {
  799. ret = php_mb_parse_encoding_array(encoding.toArray(),
  800. return_list, return_size,
  801. persistent ? 1 : 0);
  802. } else {
  803. String enc = encoding.toString();
  804. ret = php_mb_parse_encoding_list(enc.data(), enc.size(),
  805. return_list, return_size,
  806. persistent ? 1 : 0);
  807. }
  808. if (!ret) {
  809. if (return_list && *return_list) {
  810. free(*return_list);
  811. *return_list = NULL;
  812. }
  813. return_size = 0;
  814. }
  815. return ret;
  816. }
  817. static int php_mb_nls_get_default_detect_order_list(mbfl_no_language lang,
  818. mbfl_no_encoding **plist,
  819. int* plist_size) {
  820. size_t i;
  821. *plist = (mbfl_no_encoding *) php_mb_default_identify_list_neut;
  822. *plist_size = sizeof(php_mb_default_identify_list_neut) /
  823. sizeof(php_mb_default_identify_list_neut[0]);
  824. for (i = 0; i < sizeof(php_mb_default_identify_list) /
  825. sizeof(php_mb_default_identify_list[0]); i++) {
  826. if (php_mb_default_identify_list[i].lang == lang) {
  827. *plist = php_mb_default_identify_list[i].list;
  828. *plist_size = php_mb_default_identify_list[i].list_size;
  829. return 1;
  830. }
  831. }
  832. return 0;
  833. }
  834. static size_t php_mb_mbchar_bytes_ex(const char *s, const mbfl_encoding *enc) {
  835. if (enc != NULL) {
  836. if (enc->flag & MBFL_ENCTYPE_MBCS) {
  837. if (enc->mblen_table != NULL) {
  838. if (s != NULL) return enc->mblen_table[*(unsigned char *)s];
  839. }
  840. } else if (enc->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
  841. return 2;
  842. } else if (enc->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
  843. return 4;
  844. }
  845. }
  846. return 1;
  847. }
  848. static int php_mb_stripos(int mode,
  849. const char *old_haystack, int old_haystack_len,
  850. const char *old_needle, int old_needle_len,
  851. long offset, const char *from_encoding) {
  852. int n;
  853. mbfl_string haystack, needle;
  854. n = -1;
  855. mbfl_string_init(&haystack);
  856. mbfl_string_init(&needle);
  857. haystack.no_language = MBSTRG(current_language);
  858. haystack.no_encoding = MBSTRG(current_internal_encoding);
  859. needle.no_language = MBSTRG(current_language);
  860. needle.no_encoding = MBSTRG(current_internal_encoding);
  861. do {
  862. haystack.val = (unsigned char *)php_unicode_convert_case
  863. (PHP_UNICODE_CASE_UPPER, old_haystack, (size_t)old_haystack_len,
  864. &haystack.len, from_encoding);
  865. if (!haystack.val) {
  866. break;
  867. }
  868. if (haystack.len <= 0) {
  869. break;
  870. }
  871. needle.val = (unsigned char *)php_unicode_convert_case
  872. (PHP_UNICODE_CASE_UPPER, old_needle, (size_t)old_needle_len,
  873. &needle.len, from_encoding);
  874. if (!needle.val) {
  875. break;
  876. }
  877. if (needle.len <= 0) {
  878. break;
  879. }
  880. haystack.no_encoding = needle.no_encoding =
  881. mbfl_name2no_encoding(from_encoding);
  882. if (haystack.no_encoding == mbfl_no_encoding_invalid) {
  883. raise_warning("Unknown encoding \"%s\"", from_encoding);
  884. break;
  885. }
  886. int haystack_char_len = mbfl_strlen(&haystack);
  887. if (mode) {
  888. if ((offset > 0 && offset > haystack_char_len) ||
  889. (offset < 0 && -offset > haystack_char_len)) {
  890. raise_warning("Offset is greater than the length of haystack string");
  891. break;
  892. }
  893. } else {
  894. if (offset < 0 || offset > haystack_char_len) {
  895. raise_warning("Offset not contained in string.");
  896. break;
  897. }
  898. }
  899. n = mbfl_strpos(&haystack, &needle, offset, mode);
  900. } while(0);
  901. if (haystack.val) {
  902. free(haystack.val);
  903. }
  904. if (needle.val) {
  905. free(needle.val);
  906. }
  907. return n;
  908. }
  909. ///////////////////////////////////////////////////////////////////////////////
  910. Array f_mb_list_encodings() {
  911. Array ret;
  912. int i = 0;
  913. const mbfl_encoding **encodings = mbfl_get_supported_encodings();
  914. const mbfl_encoding *encoding;
  915. while ((encoding = encodings[i++]) != NULL) {
  916. ret.append(String(encoding->name, CopyString));
  917. }
  918. return ret;
  919. }
  920. Variant f_mb_list_encodings_alias_names(const String& name /*= null_string*/) {
  921. const mbfl_encoding **encodings;
  922. const mbfl_encoding *encoding;
  923. mbfl_no_encoding no_encoding;
  924. int i, j;
  925. Array ret;
  926. if (name.isNull()) {
  927. i = 0;
  928. encodings = mbfl_get_supported_encodings();
  929. while ((encoding = encodings[i++]) != NULL) {
  930. Array row;
  931. if (encoding->aliases != NULL) {
  932. j = 0;
  933. while ((*encoding->aliases)[j] != NULL) {
  934. row.append(String((*encoding->aliases)[j], CopyString));
  935. j++;
  936. }
  937. }
  938. ret.set(String(encoding->name, CopyString), row);
  939. }
  940. } else {
  941. no_encoding = mbfl_name2no_encoding(name.data());
  942. if (no_encoding == mbfl_no_encoding_invalid) {
  943. raise_warning("Unknown encoding \"%s\"", name.data());
  944. return false;
  945. }
  946. char *name = (char *)mbfl_no_encoding2name(no_encoding);
  947. if (name != NULL) {
  948. i = 0;
  949. encodings = mbfl_get_supported_encodings();
  950. while ((encoding = encodings[i++]) != NULL) {
  951. if (strcmp(encoding->name, name) != 0) continue;
  952. if (encoding->aliases != NULL) {
  953. j = 0;
  954. while ((*encoding->aliases)[j] != NULL) {
  955. ret.append(String((*encoding->aliases)[j], CopyString));
  956. j++;
  957. }
  958. }
  959. break;
  960. }
  961. } else {
  962. return false;
  963. }
  964. }
  965. return ret;
  966. }
  967. Variant f_mb_list_mime_names(const String& name /* = null_string */) {
  968. const mbfl_encoding **encodings;
  969. const mbfl_encoding *encoding;
  970. mbfl_no_encoding no_encoding;
  971. int i;
  972. Array ret;
  973. if (name.isNull()) {
  974. i = 0;
  975. encodings = mbfl_get_supported_encodings();
  976. while ((encoding = encodings[i++]) != NULL) {
  977. if (encoding->mime_name != NULL) {
  978. ret.set(String(encoding->name, CopyString),
  979. String(encoding->mime_name, CopyString));
  980. } else{
  981. ret.set(String(encoding->name, CopyString), "");
  982. }
  983. }
  984. } else {
  985. no_encoding = mbfl_name2no_encoding(name.data());
  986. if (no_encoding == mbfl_no_encoding_invalid) {
  987. raise_warning("Unknown encoding \"%s\"", name.data());
  988. return false;
  989. }
  990. char *name = (char *)mbfl_no_encoding2name(no_encoding);
  991. if (name != NULL) {
  992. i = 0;
  993. encodings = mbfl_get_supported_encodings();
  994. while ((encoding = encodings[i++]) != NULL) {
  995. if (strcmp(encoding->name, name) != 0) continue;
  996. if (encoding->mime_name != NULL) {
  997. return String(encoding->mime_name, CopyString);
  998. }
  999. break;
  1000. }
  1001. return "";
  1002. } else {
  1003. return false;
  1004. }
  1005. }
  1006. return ret;
  1007. }
  1008. bool f_mb_check_encoding(const String& var /* = null_string */,
  1009. const String& encoding /* = null_string */) {
  1010. mbfl_buffer_converter *convd;
  1011. mbfl_no_encoding no_encoding = MBSTRG(current_internal_encoding);
  1012. mbfl_string string, result, *ret = NULL;
  1013. long illegalchars = 0;
  1014. if (var.isNull()) {
  1015. return MBSTRG(illegalchars) == 0;
  1016. }
  1017. if (!encoding.isNull()) {
  1018. no_encoding = mbfl_name2no_encoding(encoding.data());
  1019. if (no_encoding == mbfl_no_encoding_invalid ||
  1020. no_encoding == mbfl_no_encoding_pass) {
  1021. raise_warning("Invalid encoding \"%s\"", encoding.data());
  1022. return false;
  1023. }
  1024. }
  1025. convd = mbfl_buffer_converter_new(no_encoding, no_encoding, 0);
  1026. if (convd == NULL) {
  1027. raise_warning("Unable to create converter");
  1028. return false;
  1029. }
  1030. mbfl_buffer_converter_illegal_mode
  1031. (convd, MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE);
  1032. mbfl_buffer_converter_illegal_substchar
  1033. (convd, 0);
  1034. /* initialize string */
  1035. mbfl_string_init_set(&string, mbfl_no_language_neutral, no_encoding);
  1036. mbfl_string_init(&result);
  1037. string.val = (unsigned char *)var.data();
  1038. string.len = var.size();
  1039. ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
  1040. illegalchars = mbfl_buffer_illegalchars(convd);
  1041. mbfl_buffer_converter_delete(convd);
  1042. if (ret != NULL) {
  1043. MBSTRG(illegalchars) += illegalchars;
  1044. if (illegalchars == 0 && string.len == ret->len &&
  1045. memcmp((const char *)string.val, (const char *)ret->val,
  1046. string.len) == 0) {
  1047. mbfl_string_clear(&result);
  1048. return true;
  1049. } else {
  1050. mbfl_string_clear(&result);
  1051. return false;
  1052. }
  1053. } else {
  1054. return false;
  1055. }
  1056. }
  1057. Variant f_mb_convert_case(const String& str, int mode,
  1058. const String& encoding /* = null_string */) {
  1059. const char *enc = NULL;
  1060. if (encoding.empty()) {
  1061. enc = mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
  1062. }
  1063. unsigned int ret_len;
  1064. char *newstr = php_unicode_convert_case(mode, str.data(), str.size(),
  1065. &ret_len, enc);
  1066. if (newstr) {
  1067. return String(newstr, ret_len, AttachString);
  1068. }
  1069. return false;
  1070. }
  1071. Variant f_mb_convert_encoding(const String& str, const String& to_encoding,
  1072. CVarRef from_encoding /* = null_variant */) {
  1073. String encoding = from_encoding.toString();
  1074. if (from_encoding.is(KindOfArray)) {
  1075. StringBuffer _from_encodings;
  1076. Array encs = from_encoding.toArray();
  1077. for (ArrayIter iter(encs); iter; ++iter) {
  1078. if (!_from_encodings.empty()) {
  1079. _from_encodings.append(",");
  1080. }
  1081. _from_encodings.append(iter.second().toString());
  1082. }
  1083. encoding = _from_encodings.detach();
  1084. }
  1085. unsigned int size;
  1086. char *ret = php_mb_convert_encoding(str.data(), str.size(),
  1087. to_encoding.data(),
  1088. (!encoding.empty() ?
  1089. encoding.data() : NULL),
  1090. &size);
  1091. if (ret != NULL) {
  1092. return String(ret, size, AttachString);
  1093. }
  1094. return false;
  1095. }
  1096. Variant f_mb_convert_kana(const String& str,
  1097. const String& option /* = null_string */,
  1098. const String& encoding /* = null_string */) {
  1099. mbfl_string string, result, *ret;
  1100. mbfl_string_init(&string);
  1101. string.no_language = MBSTRG(current_language);
  1102. string.no_encoding = MBSTRG(current_internal_encoding);
  1103. string.val = (unsigned char *)str.data();
  1104. string.len = str.size();
  1105. int opt = 0x900;
  1106. if (!option.empty()) {
  1107. const char *p = option.data();
  1108. int n = option.size();
  1109. int i = 0;
  1110. opt = 0;
  1111. while (i < n) {
  1112. i++;
  1113. switch (*p++) {
  1114. case 'A': opt |= 0x1; break;
  1115. case 'a': opt |= 0x10; break;
  1116. case 'R': opt |= 0x2; break;
  1117. case 'r': opt |= 0x20; break;
  1118. case 'N': opt |= 0x4; break;
  1119. case 'n': opt |= 0x40; break;
  1120. case 'S': opt |= 0x8; break;
  1121. case 's': opt |= 0x80; break;
  1122. case 'K': opt |= 0x100; break;
  1123. case 'k': opt |= 0x1000; break;
  1124. case 'H': opt |= 0x200; break;
  1125. case 'h': opt |= 0x2000; break;
  1126. case 'V': opt |= 0x800; break;
  1127. case 'C': opt |= 0x10000; break;
  1128. case 'c': opt |= 0x20000; break;
  1129. case 'M': opt |= 0x100000; break;
  1130. case 'm': opt |= 0x200000; break;
  1131. }
  1132. }
  1133. }
  1134. /* encoding */
  1135. if (!encoding.empty()) {
  1136. string.no_encoding = mbfl_name2no_encoding(encoding.data());
  1137. if (string.no_encoding == mbfl_no_encoding_invalid) {
  1138. raise_warning("Unknown encoding \"%s\"", encoding.data());
  1139. return false;
  1140. }
  1141. }
  1142. ret = mbfl_ja_jp_hantozen(&string, &result, opt);
  1143. if (ret != NULL) {
  1144. return String(reinterpret_cast<char*>(ret->val), ret->len, AttachString);
  1145. }
  1146. return false;
  1147. }
  1148. static bool php_mbfl_encoding_detect(CVarRef var,
  1149. mbfl_encoding_detector *identd,
  1150. mbfl_string *string) {
  1151. if (var.is(KindOfArray) || var.is(KindOfObject)) {
  1152. Array items = var.toArray();
  1153. for (ArrayIter iter(items); iter; ++iter) {
  1154. if (php_mbfl_encoding_detect(iter.second(), identd, string)) {
  1155. return true;
  1156. }
  1157. }
  1158. } else if (var.isString()) {
  1159. String svar = var.toString();
  1160. string->val = (unsigned char *)svar.data();
  1161. string->len = svar.size();
  1162. if (mbfl_encoding_detector_feed(identd, string)) {
  1163. return true;
  1164. }
  1165. }
  1166. return false;
  1167. }
  1168. static Variant php_mbfl_convert(CVarRef var,
  1169. mbfl_buffer_converter *convd,
  1170. mbfl_string *string,
  1171. mbfl_string *result) {
  1172. if (var.is(KindOfArray)) {
  1173. Array ret;
  1174. Array items = var.toArray();
  1175. for (ArrayIter iter(items); iter; ++iter) {
  1176. ret.set(iter.first(),
  1177. php_mbfl_convert(iter.second(), convd, string, result));
  1178. }
  1179. return ret;
  1180. }
  1181. if (var.is(KindOfObject)) {
  1182. Object obj = var.toObject();
  1183. Array items = var.toArray();
  1184. for (ArrayIter iter(items); iter; ++iter) {
  1185. obj->o_set(iter.first().toString(),
  1186. php_mbfl_convert(iter.second().toString().data(), convd,
  1187. string, result));
  1188. }
  1189. return var; // which still has obj
  1190. }
  1191. if (var.isString()) {
  1192. String svar = var.toString();
  1193. string->val = (unsigned char *)svar.data();
  1194. string->len = svar.size();
  1195. mbfl_string *ret =
  1196. mbfl_buffer_converter_feed_result(convd, string, result);
  1197. return String(reinterpret_cast<char*>(ret->val), ret->len, AttachString);
  1198. }
  1199. return var;
  1200. }
  1201. Variant f_mb_convert_variables(int _argc, const String& to_encoding,
  1202. CVarRef from_encoding, VRefParam vars,
  1203. CArrRef _argv /* = null_array */) {
  1204. mbfl_string string, result;
  1205. mbfl_no_encoding _from_encoding, _to_encoding;
  1206. mbfl_encoding_detector *identd;
  1207. mbfl_buffer_converter *convd;
  1208. int elistsz;
  1209. mbfl_no_encoding *elist;
  1210. char *name;
  1211. /* new encoding */
  1212. _to_encoding = mbfl_name2no_encoding(to_encoding.data());
  1213. if (_to_encoding == mbfl_no_encoding_invalid) {
  1214. raise_warning("Unknown encoding \"%s\"", to_encoding.data());
  1215. return false;
  1216. }
  1217. /* initialize string */
  1218. mbfl_string_init(&string);
  1219. mbfl_string_init(&result);
  1220. _from_encoding = MBSTRG(current_internal_encoding);
  1221. string.no_encoding = _from_encoding;
  1222. string.no_language = MBSTRG(current_language);
  1223. /* pre-conversion encoding */
  1224. elist = NULL;
  1225. elistsz = 0;
  1226. php_mb_parse_encoding(from_encoding, &elist, &elistsz, false);
  1227. if (elistsz <= 0) {
  1228. _from_encoding = mbfl_no_encoding_pass;
  1229. } else if (elistsz == 1) {
  1230. _from_encoding = *elist;
  1231. } else {
  1232. /* auto detect */
  1233. _from_encoding = mbfl_no_encoding_invalid;
  1234. identd = mbfl_encoding_detector_new(elist, elistsz,
  1235. MBSTRG(strict_detection));
  1236. if (identd != NULL) {
  1237. for (int n = -1; n < _argv.size(); n++) {
  1238. if (php_mbfl_encoding_detect(n < 0 ? (Variant&)vars : _argv[n],
  1239. identd, &string)) {
  1240. break;
  1241. }
  1242. }
  1243. _from_encoding = mbfl_encoding_detector_judge(identd);
  1244. mbfl_encoding_detector_delete(identd);
  1245. }
  1246. if (_from_encoding == mbfl_no_encoding_invalid) {
  1247. raise_warning("Unable to detect encoding");
  1248. _from_encoding = mbfl_no_encoding_pass;
  1249. }
  1250. }
  1251. if (elist != NULL) {
  1252. free((void *)elist);
  1253. }
  1254. /* create converter */
  1255. convd = NULL;
  1256. if (_from_encoding != mbfl_no_encoding_pass) {
  1257. convd = mbfl_buffer_converter_new(_from_encoding, _to_encoding, 0);
  1258. if (convd == NULL) {
  1259. raise_warning("Unable to create converter");
  1260. return false;
  1261. }
  1262. mbfl_buffer_converter_illegal_mode
  1263. (convd, MBSTRG(current_filter_illegal_mode));
  1264. mbfl_buffer_converter_illegal_substchar
  1265. (convd, MBSTRG(current_filter_illegal_substchar));
  1266. }
  1267. /* convert */
  1268. if (convd != NULL) {
  1269. vars = php_mbfl_convert(vars, convd, &string, &result);
  1270. for (int n = 0; n < _argv.size(); n++) {
  1271. const_cast<Array&>(_argv).lval(n) =
  1272. php_mbfl_convert(_argv[n], convd, &string, &result);
  1273. }
  1274. MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
  1275. mbfl_buffer_converter_delete(convd);
  1276. }
  1277. name = (char *)mbfl_no_encoding2name(_from_encoding);
  1278. if (name != NULL) {
  1279. return String(name, CopyString);
  1280. }
  1281. return false;
  1282. }
  1283. Variant f_mb_decode_mimeheader(const String& str) {
  1284. mbfl_string string, result, *ret;
  1285. mbfl_string_init(&string);
  1286. string.no_language = MBSTRG(current_language);
  1287. string.no_encoding = MBSTRG(current_internal_encoding);
  1288. string.val = (unsigned char *)str.data();
  1289. string.len = str.size();
  1290. mbfl_string_init(&result);
  1291. ret = mbfl_mime_header_decode(&string, &result,
  1292. MBSTRG(current_internal_encoding));
  1293. if (ret != NULL) {
  1294. return String(reinterpret_cast<char*>(ret->val), ret->len, AttachString);
  1295. }
  1296. return false;
  1297. }
  1298. static Variant php_mb_numericentity_exec(const String& str, CVarRef convmap,
  1299. const String& encoding, int type) {
  1300. int mapsize=0;
  1301. mbfl_string string, result, *ret;
  1302. mbfl_no_encoding no_encoding;
  1303. mbfl_string_init(&string);
  1304. string.no_language = MBSTRG(current_language);
  1305. string.no_encoding = MBSTRG(current_internal_encoding);
  1306. string.val = (unsigned char *)str.data();
  1307. string.len = str.size();
  1308. /* encoding */
  1309. if (!encoding.empty()) {
  1310. no_encoding = mbfl_name2no_encoding(encoding.data());
  1311. if (no_encoding == mbfl_no_encoding_invalid) {
  1312. raise_warning("Unknown encoding \"%s\"", encoding.data());
  1313. return false;
  1314. } else {
  1315. string.no_encoding = no_encoding;
  1316. }
  1317. }
  1318. /* conversion map */
  1319. int *iconvmap = NULL;
  1320. if (convmap.is(KindOfArray)) {
  1321. Array convs = convmap.toArray();
  1322. mapsize = convs.size();
  1323. if (mapsize > 0) {
  1324. iconvmap = (int*)malloc(mapsize * sizeof(int));
  1325. int *mapelm = iconvmap;
  1326. for (ArrayIter iter(convs); iter; ++iter) {
  1327. *mapelm++ = iter.second().toInt32();
  1328. }
  1329. }
  1330. }
  1331. if (iconvmap == NULL) {
  1332. return false;
  1333. }
  1334. mapsize /= 4;
  1335. ret = mbfl_html_numeric_entity(&string, &result, iconvmap, mapsize, type);
  1336. free(iconvmap);
  1337. if (ret != NULL) {
  1338. return String(reinterpret_cast<char*>(ret->val), ret->len, AttachString);
  1339. }
  1340. return false;
  1341. }
  1342. Variant f_mb_decode_numericentity(const String& str, CVarRef convmap,
  1343. const String& encoding /* = null_string */) {
  1344. return php_mb_numericentity_exec(str, convmap, encoding, 1);
  1345. }
  1346. Variant f_mb_detect_encoding(const String& str,
  1347. CVarRef encoding_list /* = null_variant */,
  1348. CVarRef strict /* = null_variant */) {
  1349. mbfl_string string;
  1350. const char *ret;
  1351. mbfl_no_encoding *elist;
  1352. int size;
  1353. mbfl_no_encoding *list = 0;
  1354. /* make encoding list */
  1355. list = NULL;
  1356. size = 0;
  1357. php_mb_parse_encoding(encoding_list, &list, &size, false);
  1358. if (size > 0 && list != NULL) {
  1359. elist = list;
  1360. } else {
  1361. elist = MBSTRG(current_detect_order_list);
  1362. size = MBSTRG(current_detect_order_list_size);
  1363. }
  1364. long nstrict = 0;
  1365. if (!strict.isNull()) {
  1366. nstrict = strict.toInt64();
  1367. } else {
  1368. nstrict = MBSTRG(strict_detection);
  1369. }
  1370. mbfl_string_init(&string);
  1371. string.no_language = MBSTRG(current_language);
  1372. string.val = (unsigned char *)str.data();
  1373. string.len = str.size();
  1374. ret = mbfl_identify_encoding_name(&string, elist, size, nstrict);
  1375. if (list != NULL) {
  1376. free(list);
  1377. }
  1378. if (ret != NULL) {
  1379. return String(ret, CopyString);
  1380. }
  1381. return false;
  1382. }
  1383. Variant f_mb_detect_order(CVarRef encoding_list /* = null_variant */) {
  1384. int n, size;
  1385. mbfl_no_encoding *list, *entry;
  1386. if (encoding_list.isNull()) {
  1387. Array ret;
  1388. entry = MBSTRG(current_detect_order_list);
  1389. n = MBSTRG(current_detect_order_list_size);
  1390. while (n > 0) {
  1391. char *name = (char *)mbfl_no_encoding2name(*entry);
  1392. if (name) {
  1393. ret.append(String(name, CopyString));
  1394. }
  1395. entry++;
  1396. n--;
  1397. }
  1398. return ret;
  1399. }
  1400. list = NULL;
  1401. size = 0;
  1402. if (!php_mb_parse_encoding(encoding_list, &list, &size, false) ||
  1403. list == NULL) {
  1404. return false;
  1405. }
  1406. if (MBSTRG(current_detect_order_list)) {
  1407. free(MBSTRG(current_detect_order_list));
  1408. }
  1409. MBSTRG(current_detect_order_list) = list;
  1410. MBSTRG(current_detect_order_list_size) = size;
  1411. return true;
  1412. }
  1413. Variant f_mb_encode_mimeheader(const String& str,
  1414. const String& charset /* = null_string */,
  1415. const String& transfer_encoding/*= null_string*/,
  1416. const String& linefeed /* = "\r\n" */,
  1417. int indent /* = 0 */) {
  1418. mbfl_no_encoding charsetenc, transenc;
  1419. mbfl_string string, result, *ret;
  1420. mbfl_string_init(&string);
  1421. string.no_language = MBSTRG(current_language);
  1422. string.no_encoding = MBSTRG(current_internal_encoding);
  1423. string.val = (unsigned char *)str.data();
  1424. string.len = str.size();
  1425. charsetenc = mbfl_no_encoding_pass;
  1426. transenc = mbfl_no_encoding_base64;
  1427. if (!charset.empty()) {
  1428. charsetenc = mbfl_name2no_encoding(charset.data());
  1429. if (charsetenc == mbfl_no_encoding_invalid) {
  1430. raise_warning("Unknown encoding \"%s\"", charset.data());
  1431. return false;
  1432. }
  1433. } else {
  1434. const mbfl_language *lang = mbfl_no2language(MBSTRG(current_language));
  1435. if (lang != NULL) {
  1436. charsetenc = lang->mail_charset;
  1437. transenc = lang->mail_header_encoding;
  1438. }
  1439. }
  1440. if (!transfer_encoding.empty()) {
  1441. char ch = *transfer_encoding.data();
  1442. if (ch == 'B' || ch == 'b') {
  1443. transenc = mbfl_no_encoding_base64;
  1444. } else if (ch == 'Q' || ch == 'q') {
  1445. transenc = mbfl_no_encoding_qprint;
  1446. }
  1447. }
  1448. mbfl_string_init(&result);
  1449. ret = mbfl_mime_header_encode(&string, &result, charsetenc, transenc,
  1450. linefeed.data(), indent);
  1451. if (ret != NULL) {
  1452. return String(reinterpret_cast<char*>(ret->val), ret->len, AttachString);
  1453. }
  1454. return false;
  1455. }
  1456. Variant f_mb_encode_numericentity(const String& str, CVarRef convmap,
  1457. const String& encoding /* = null_string */) {
  1458. return php_mb_numericentity_exec(str, convmap, encoding, 0);
  1459. }
  1460. const StaticString
  1461. s_internal_encoding("internal_encoding"),
  1462. s_http_input("http_input"),
  1463. s_http_output("http_output"),
  1464. s_mail_charset("mail_charset"),
  1465. s_mail_header_encoding("mail_header_encoding"),
  1466. s_mail_body_encoding("mail_body_encoding"),
  1467. s_illegal_chars("illegal_chars"),
  1468. s_encoding_translation("encoding_translation"),
  1469. s_On("On"),
  1470. s_Off("Off"),
  1471. s_language("language"),
  1472. s_detect_order("detect_order"),
  1473. s_substitute_character("substitute_character"),
  1474. s_strict_detection("strict_detection"),
  1475. s_none("none"),
  1476. s_long("long"),
  1477. s_entity("entity");
  1478. Variant f_mb_get_info(const String& type /* = null_string */) {
  1479. const mbfl_language *lang = mbfl_no2language(MBSTRG(current_language));
  1480. mbfl_no_encoding *entry;
  1481. int n;
  1482. char *name;
  1483. if (type.empty() || strcasecmp(type.data(), "all") == 0) {
  1484. Array ret;
  1485. if ((name = (char *)mbfl_no_encoding2name
  1486. (MBSTRG(current_internal_encoding))) != NULL) {
  1487. ret.set(s_internal_encoding, String(name, CopyString));
  1488. }
  1489. if ((name = (char *)mbfl_no_encoding2name
  1490. (MBSTRG(http_input_identify))) != NULL) {
  1491. ret.set(s_http_input, String(name, CopyString));
  1492. }
  1493. if ((name = (char *)mbfl_no_encoding2name
  1494. (MBSTRG(current_http_output_encoding))) != NULL) {
  1495. ret.set(s_http_output, String(name, CopyString));
  1496. }
  1497. if (lang != NULL) {
  1498. if ((name = (char *)mbfl_no_encoding2name
  1499. (lang->mail_charset)) != NULL) {
  1500. ret.set(s_mail_charset, String(name, CopyString));
  1501. }
  1502. if ((name = (char *)mbfl_no_encoding2name
  1503. (lang->mail_header_encoding)) != NULL) {
  1504. ret.set(s_mail_header_encoding, String(name, CopyString));
  1505. }
  1506. if ((name = (char *)mbfl_no_encoding2name
  1507. (lang->mail_body_encoding)) != NULL) {
  1508. ret.set(s_mail_body_encoding, String(name, CopyString));
  1509. }
  1510. }
  1511. ret.se

Large files files are truncated, but you can click here to view the full file