PageRenderTime 69ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/ext/mbstring/php_mbregex.c

http://github.com/php/php-src
C | 1675 lines | 1321 code | 174 blank | 180 comment | 325 complexity | 07a20f3c02bf661f9658f8fd02db53cb MD5 | raw file
Possible License(s): BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, LGPL-2.1
  1. /*
  2. +----------------------------------------------------------------------+
  3. | Copyright (c) The PHP Group |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | http://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
  14. +----------------------------------------------------------------------+
  15. */
  16. #ifdef HAVE_CONFIG_H
  17. #include "config.h"
  18. #endif
  19. #include "php.h"
  20. #include "php_ini.h"
  21. #if HAVE_MBREGEX
  22. #include "zend_smart_str.h"
  23. #include "ext/standard/info.h"
  24. #include "php_mbregex.h"
  25. #include "mbstring.h"
  26. #include "libmbfl/filters/mbfilter_utf8.h"
  27. #include "php_onig_compat.h" /* must come prior to the oniguruma header */
  28. #include <oniguruma.h>
  29. #undef UChar
  /*
   * Compatibility shim for Oniguruma releases with ONIGURUMA_VERSION_INT below
   * 60800: the OnigMatchParam based calls used further down are mapped onto
   * no-ops, and the *_with_param() entry points onto their plain counterparts
   * (the trailing match-param argument is simply dropped).
   */
  #if ONIGURUMA_VERSION_INT < 60800
  typedef void OnigMatchParam;
  #define onig_new_match_param() (NULL)
  #define onig_initialize_match_param(x) (void)(x)
  #define onig_set_match_stack_limit_size_of_match_param(x, y)
  #define onig_set_retry_limit_in_match_of_match_param(x, y)
  #define onig_free_match_param(x)
  #define onig_search_with_param(reg, str, end, start, range, region, option, mp) \
  onig_search(reg, str, end, start, range, region, option)
  #define onig_match_with_param(re, str, end, at, region, option, mp) \
  onig_match(re, str, end, at, region, option)
  #endif
  /* Brings the mbstring module globals into scope so MBSTRG() can be used in this file. */
  ZEND_EXTERN_MODULE_GLOBALS(mbstring)
  /*
   * State of the multibyte regex subsystem. Allocated by
   * php_mb_regex_globals_alloc() and accessed through the MBREX() macro.
   */
  struct _zend_mb_regex_globals {
  /* Oniguruma encoding that current_mbctype is reset to at request shutdown. */
  OnigEncoding default_mbctype;
  /* Oniguruma encoding used when compiling patterns. */
  OnigEncoding current_mbctype;
  /* libmbfl encoding paired with current_mbctype; used to validate patterns and subjects. */
  const mbfl_encoding *current_mbctype_mbfl_encoding;
  /* Compiled-regex cache keyed by the raw pattern bytes; created in RINIT, destroyed in RSHUTDOWN. */
  HashTable ht_rc;
  /*
   * Search state, reset at request shutdown.
   * NOTE(review): the consumers are not in the visible part of this file
   * (presumably the mb_ereg_search*() family) - confirm.
   */
  zval search_str;
  zval *search_str_val; /* not referenced in the visible part of this file */
  size_t search_pos;
  php_mb_regex_t *search_re;
  OnigRegion *search_regs;
  /* Options and syntax applied when the caller supplies no option string. */
  OnigOptionType regex_default_options;
  OnigSyntaxType *regex_default_syntax;
  };
  /* Shorthand for a member of the regex globals hanging off the mbstring globals. */
  #define MBREX(g) (MBSTRG(mb_regex_globals)->g)
  57. /* {{{ static void php_mb_regex_free_cache() */
  58. static void php_mb_regex_free_cache(zval *el) {
  59. onig_free((php_mb_regex_t *)Z_PTR_P(el));
  60. }
  61. /* }}} */
  62. /* {{{ _php_mb_regex_globals_ctor */
  63. static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
  64. {
  65. pglobals->default_mbctype = ONIG_ENCODING_UTF8;
  66. pglobals->current_mbctype = ONIG_ENCODING_UTF8;
  67. pglobals->current_mbctype_mbfl_encoding = &mbfl_encoding_utf8;
  68. ZVAL_UNDEF(&pglobals->search_str);
  69. pglobals->search_re = (php_mb_regex_t*)NULL;
  70. pglobals->search_pos = 0;
  71. pglobals->search_regs = (OnigRegion*)NULL;
  72. pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
  73. pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY;
  74. return SUCCESS;
  75. }
  76. /* }}} */
  77. /* {{{ _php_mb_regex_globals_dtor */
  78. static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals)
  79. {
  80. }
  81. /* }}} */
  82. /* {{{ php_mb_regex_globals_alloc */
  83. zend_mb_regex_globals *php_mb_regex_globals_alloc(void)
  84. {
  85. zend_mb_regex_globals *pglobals = pemalloc(
  86. sizeof(zend_mb_regex_globals), 1);
  87. if (SUCCESS != _php_mb_regex_globals_ctor(pglobals)) {
  88. pefree(pglobals, 1);
  89. return NULL;
  90. }
  91. return pglobals;
  92. }
  93. /* }}} */
  94. /* {{{ php_mb_regex_globals_free */
  95. void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals)
  96. {
  97. if (!pglobals) {
  98. return;
  99. }
  100. _php_mb_regex_globals_dtor(pglobals);
  101. pefree(pglobals, 1);
  102. }
  103. /* }}} */
  104. /* {{{ PHP_MINIT_FUNCTION(mb_regex) */
  105. PHP_MINIT_FUNCTION(mb_regex)
  106. {
  107. char version[256];
  108. onig_init();
  109. snprintf(version, sizeof(version), "%d.%d.%d",
  110. ONIGURUMA_VERSION_MAJOR, ONIGURUMA_VERSION_MINOR, ONIGURUMA_VERSION_TEENY);
  111. REGISTER_STRING_CONSTANT("MB_ONIGURUMA_VERSION", version, CONST_CS | CONST_PERSISTENT);
  112. return SUCCESS;
  113. }
  114. /* }}} */
  115. /* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */
  116. PHP_MSHUTDOWN_FUNCTION(mb_regex)
  117. {
  118. onig_end();
  119. return SUCCESS;
  120. }
  121. /* }}} */
  122. /* {{{ PHP_RINIT_FUNCTION(mb_regex) */
  123. PHP_RINIT_FUNCTION(mb_regex)
  124. {
  125. if (!MBSTRG(mb_regex_globals)) return FAILURE;
  126. zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0);
  127. return SUCCESS;
  128. }
  129. /* }}} */
  130. /* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */
  131. PHP_RSHUTDOWN_FUNCTION(mb_regex)
  132. {
  133. MBREX(current_mbctype) = MBREX(default_mbctype);
  134. MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(php_mb_regex_get_default_mbctype());
  135. if (!Z_ISUNDEF(MBREX(search_str))) {
  136. zval_ptr_dtor(&MBREX(search_str));
  137. ZVAL_UNDEF(&MBREX(search_str));
  138. }
  139. MBREX(search_pos) = 0;
  140. MBREX(search_re) = NULL;
  141. if (MBREX(search_regs) != NULL) {
  142. onig_region_free(MBREX(search_regs), 1);
  143. MBREX(search_regs) = (OnigRegion *)NULL;
  144. }
  145. zend_hash_destroy(&MBREX(ht_rc));
  146. return SUCCESS;
  147. }
  148. /* }}} */
  149. /* {{{ PHP_MINFO_FUNCTION(mb_regex) */
  150. PHP_MINFO_FUNCTION(mb_regex)
  151. {
  152. char buf[32];
  153. php_info_print_table_start();
  154. php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled");
  155. snprintf(buf, sizeof(buf), "%d.%d.%d",
  156. ONIGURUMA_VERSION_MAJOR,
  157. ONIGURUMA_VERSION_MINOR,
  158. ONIGURUMA_VERSION_TEENY);
  159. php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf);
  160. php_info_print_table_end();
  161. }
  162. /* }}} */
  163. /*
  164. * encoding name resolver
  165. */
  166. /* {{{ encoding name map */
  /*
   * One row of the encoding name table: a list of accepted names for an
   * Oniguruma encoding.
   */
  typedef struct _php_mb_regex_enc_name_map_t {
  /* Aliases packed back to back, each NUL-terminated; an empty string ends the list. */
  const char *names;
  /* Oniguruma encoding all of the aliases resolve to. */
  OnigEncoding code;
  } php_mb_regex_enc_name_map_t;
  171. static const php_mb_regex_enc_name_map_t enc_name_map[] = {
  172. #ifdef ONIG_ENCODING_EUC_JP
  173. {
  174. "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
  175. ONIG_ENCODING_EUC_JP
  176. },
  177. #endif
  178. #ifdef ONIG_ENCODING_UTF8
  179. {
  180. "UTF-8\0UTF8\0",
  181. ONIG_ENCODING_UTF8
  182. },
  183. #endif
  184. #ifdef ONIG_ENCODING_UTF16_BE
  185. {
  186. "UTF-16\0UTF-16BE\0",
  187. ONIG_ENCODING_UTF16_BE
  188. },
  189. #endif
  190. #ifdef ONIG_ENCODING_UTF16_LE
  191. {
  192. "UTF-16LE\0",
  193. ONIG_ENCODING_UTF16_LE
  194. },
  195. #endif
  196. #ifdef ONIG_ENCODING_UTF32_BE
  197. {
  198. "UCS-4\0UTF-32\0UTF-32BE\0",
  199. ONIG_ENCODING_UTF32_BE
  200. },
  201. #endif
  202. #ifdef ONIG_ENCODING_UTF32_LE
  203. {
  204. "UCS-4LE\0UTF-32LE\0",
  205. ONIG_ENCODING_UTF32_LE
  206. },
  207. #endif
  208. #ifdef ONIG_ENCODING_SJIS
  209. {
  210. "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
  211. ONIG_ENCODING_SJIS
  212. },
  213. #endif
  214. #ifdef ONIG_ENCODING_BIG5
  215. {
  216. "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
  217. ONIG_ENCODING_BIG5
  218. },
  219. #endif
  220. #ifdef ONIG_ENCODING_EUC_CN
  221. {
  222. "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
  223. ONIG_ENCODING_EUC_CN
  224. },
  225. #endif
  226. #ifdef ONIG_ENCODING_EUC_TW
  227. {
  228. "EUC-TW\0EUCTW\0EUC_TW\0",
  229. ONIG_ENCODING_EUC_TW
  230. },
  231. #endif
  232. #ifdef ONIG_ENCODING_EUC_KR
  233. {
  234. "EUC-KR\0EUCKR\0EUC_KR\0",
  235. ONIG_ENCODING_EUC_KR
  236. },
  237. #endif
  238. #if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY
  239. {
  240. "KOI8\0KOI-8\0",
  241. ONIG_ENCODING_KOI8
  242. },
  243. #endif
  244. #ifdef ONIG_ENCODING_KOI8_R
  245. {
  246. "KOI8R\0KOI8-R\0KOI-8R\0",
  247. ONIG_ENCODING_KOI8_R
  248. },
  249. #endif
  250. #ifdef ONIG_ENCODING_ISO_8859_1
  251. {
  252. "ISO-8859-1\0ISO8859-1\0ISO_8859_1\0ISO8859_1\0",
  253. ONIG_ENCODING_ISO_8859_1
  254. },
  255. #endif
  256. #ifdef ONIG_ENCODING_ISO_8859_2
  257. {
  258. "ISO-8859-2\0ISO8859-2\0ISO_8859_2\0ISO8859_2\0",
  259. ONIG_ENCODING_ISO_8859_2
  260. },
  261. #endif
  262. #ifdef ONIG_ENCODING_ISO_8859_3
  263. {
  264. "ISO-8859-3\0ISO8859-3\0ISO_8859_3\0ISO8859_3\0",
  265. ONIG_ENCODING_ISO_8859_3
  266. },
  267. #endif
  268. #ifdef ONIG_ENCODING_ISO_8859_4
  269. {
  270. "ISO-8859-4\0ISO8859-4\0ISO_8859_4\0ISO8859_4\0",
  271. ONIG_ENCODING_ISO_8859_4
  272. },
  273. #endif
  274. #ifdef ONIG_ENCODING_ISO_8859_5
  275. {
  276. "ISO-8859-5\0ISO8859-5\0ISO_8859_5\0ISO8859_5\0",
  277. ONIG_ENCODING_ISO_8859_5
  278. },
  279. #endif
  280. #ifdef ONIG_ENCODING_ISO_8859_6
  281. {
  282. "ISO-8859-6\0ISO8859-6\0ISO_8859_6\0ISO8859_6\0",
  283. ONIG_ENCODING_ISO_8859_6
  284. },
  285. #endif
  286. #ifdef ONIG_ENCODING_ISO_8859_7
  287. {
  288. "ISO-8859-7\0ISO8859-7\0ISO_8859_7\0ISO8859_7\0",
  289. ONIG_ENCODING_ISO_8859_7
  290. },
  291. #endif
  292. #ifdef ONIG_ENCODING_ISO_8859_8
  293. {
  294. "ISO-8859-8\0ISO8859-8\0ISO_8859_8\0ISO8859_8\0",
  295. ONIG_ENCODING_ISO_8859_8
  296. },
  297. #endif
  298. #ifdef ONIG_ENCODING_ISO_8859_9
  299. {
  300. "ISO-8859-9\0ISO8859-9\0ISO_8859_9\0ISO8859_9\0",
  301. ONIG_ENCODING_ISO_8859_9
  302. },
  303. #endif
  304. #ifdef ONIG_ENCODING_ISO_8859_10
  305. {
  306. "ISO-8859-10\0ISO8859-10\0ISO_8859_10\0ISO8859_10\0",
  307. ONIG_ENCODING_ISO_8859_10
  308. },
  309. #endif
  310. #ifdef ONIG_ENCODING_ISO_8859_11
  311. {
  312. "ISO-8859-11\0ISO8859-11\0ISO_8859_11\0ISO8859_11\0",
  313. ONIG_ENCODING_ISO_8859_11
  314. },
  315. #endif
  316. #ifdef ONIG_ENCODING_ISO_8859_13
  317. {
  318. "ISO-8859-13\0ISO8859-13\0ISO_8859_13\0ISO8859_13\0",
  319. ONIG_ENCODING_ISO_8859_13
  320. },
  321. #endif
  322. #ifdef ONIG_ENCODING_ISO_8859_14
  323. {
  324. "ISO-8859-14\0ISO8859-14\0ISO_8859_14\0ISO8859_14\0",
  325. ONIG_ENCODING_ISO_8859_14
  326. },
  327. #endif
  328. #ifdef ONIG_ENCODING_ISO_8859_15
  329. {
  330. "ISO-8859-15\0ISO8859-15\0ISO_8859_15\0ISO8859_15\0",
  331. ONIG_ENCODING_ISO_8859_15
  332. },
  333. #endif
  334. #ifdef ONIG_ENCODING_ISO_8859_16
  335. {
  336. "ISO-8859-16\0ISO8859-16\0ISO_8859_16\0ISO8859_16\0",
  337. ONIG_ENCODING_ISO_8859_16
  338. },
  339. #endif
  340. #ifdef ONIG_ENCODING_ASCII
  341. {
  342. "ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
  343. ONIG_ENCODING_ASCII
  344. },
  345. #endif
  346. { NULL, ONIG_ENCODING_UNDEF }
  347. };
  348. /* }}} */
  349. /* {{{ php_mb_regex_name2mbctype */
  350. static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
  351. {
  352. const char *p;
  353. const php_mb_regex_enc_name_map_t *mapping;
  354. if (pname == NULL || !*pname) {
  355. return ONIG_ENCODING_UNDEF;
  356. }
  357. for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
  358. for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
  359. if (strcasecmp(p, pname) == 0) {
  360. return mapping->code;
  361. }
  362. }
  363. }
  364. return ONIG_ENCODING_UNDEF;
  365. }
  366. /* }}} */
  367. /* {{{ php_mb_regex_mbctype2name */
  368. static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
  369. {
  370. const php_mb_regex_enc_name_map_t *mapping;
  371. for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
  372. if (mapping->code == mbctype) {
  373. return mapping->names;
  374. }
  375. }
  376. return NULL;
  377. }
  378. /* }}} */
  379. /* {{{ php_mb_regex_set_mbctype */
  380. int php_mb_regex_set_mbctype(const char *encname)
  381. {
  382. OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
  383. if (mbctype == ONIG_ENCODING_UNDEF) {
  384. return FAILURE;
  385. }
  386. MBREX(current_mbctype) = mbctype;
  387. MBREX(current_mbctype_mbfl_encoding) = mbfl_name2encoding(encname);
  388. return SUCCESS;
  389. }
  390. /* }}} */
  391. /* {{{ php_mb_regex_set_default_mbctype */
  392. int php_mb_regex_set_default_mbctype(const char *encname)
  393. {
  394. OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname);
  395. if (mbctype == ONIG_ENCODING_UNDEF) {
  396. return FAILURE;
  397. }
  398. MBREX(default_mbctype) = mbctype;
  399. return SUCCESS;
  400. }
  401. /* }}} */
  402. /* {{{ php_mb_regex_get_mbctype */
  403. const char *php_mb_regex_get_mbctype(void)
  404. {
  405. return _php_mb_regex_mbctype2name(MBREX(current_mbctype));
  406. }
  407. /* }}} */
  408. /* {{{ php_mb_regex_get_mbctype_encoding */
  409. const mbfl_encoding *php_mb_regex_get_mbctype_encoding(void)
  410. {
  411. return MBREX(current_mbctype_mbfl_encoding);
  412. }
  413. /* }}} */
  414. /* {{{ php_mb_regex_get_default_mbctype */
  415. const char *php_mb_regex_get_default_mbctype(void)
  416. {
  417. return _php_mb_regex_mbctype2name(MBREX(default_mbctype));
  418. }
  419. /* }}} */
  420. /*
  421. * regex cache
  422. */
  423. /* {{{ php_mbregex_compile_pattern */
  424. static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigSyntaxType *syntax)
  425. {
  426. int err_code = 0;
  427. php_mb_regex_t *retval = NULL, *rc = NULL;
  428. OnigErrorInfo err_info;
  429. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  430. OnigEncoding enc = MBREX(current_mbctype);
  431. if (!php_mb_check_encoding(pattern, patlen, php_mb_regex_get_mbctype_encoding())) {
  432. php_error_docref(NULL, E_WARNING,
  433. "Pattern is not valid under %s encoding", _php_mb_regex_mbctype2name(enc));
  434. return NULL;
  435. }
  436. rc = zend_hash_str_find_ptr(&MBREX(ht_rc), (char *)pattern, patlen);
  437. if (!rc || onig_get_options(rc) != options || onig_get_encoding(rc) != enc || onig_get_syntax(rc) != syntax) {
  438. if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) {
  439. onig_error_code_to_str(err_str, err_code, &err_info);
  440. php_error_docref(NULL, E_WARNING, "mbregex compile err: %s", err_str);
  441. return NULL;
  442. }
  443. if (rc == MBREX(search_re)) {
  444. /* reuse the new rc? see bug #72399 */
  445. MBREX(search_re) = NULL;
  446. }
  447. zend_hash_str_update_ptr(&MBREX(ht_rc), (char *)pattern, patlen, retval);
  448. } else {
  449. retval = rc;
  450. }
  451. return retval;
  452. }
  453. /* }}} */
  454. /* {{{ _php_mb_regex_get_option_string */
  455. static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax)
  456. {
  457. size_t len_left = len;
  458. size_t len_req = 0;
  459. char *p = str;
  460. char c;
  461. if ((option & ONIG_OPTION_IGNORECASE) != 0) {
  462. if (len_left > 0) {
  463. --len_left;
  464. *(p++) = 'i';
  465. }
  466. ++len_req;
  467. }
  468. if ((option & ONIG_OPTION_EXTEND) != 0) {
  469. if (len_left > 0) {
  470. --len_left;
  471. *(p++) = 'x';
  472. }
  473. ++len_req;
  474. }
  475. if ((option & (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) ==
  476. (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) {
  477. if (len_left > 0) {
  478. --len_left;
  479. *(p++) = 'p';
  480. }
  481. ++len_req;
  482. } else {
  483. if ((option & ONIG_OPTION_MULTILINE) != 0) {
  484. if (len_left > 0) {
  485. --len_left;
  486. *(p++) = 'm';
  487. }
  488. ++len_req;
  489. }
  490. if ((option & ONIG_OPTION_SINGLELINE) != 0) {
  491. if (len_left > 0) {
  492. --len_left;
  493. *(p++) = 's';
  494. }
  495. ++len_req;
  496. }
  497. }
  498. if ((option & ONIG_OPTION_FIND_LONGEST) != 0) {
  499. if (len_left > 0) {
  500. --len_left;
  501. *(p++) = 'l';
  502. }
  503. ++len_req;
  504. }
  505. if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) {
  506. if (len_left > 0) {
  507. --len_left;
  508. *(p++) = 'n';
  509. }
  510. ++len_req;
  511. }
  512. c = 0;
  513. if (syntax == ONIG_SYNTAX_JAVA) {
  514. c = 'j';
  515. } else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
  516. c = 'u';
  517. } else if (syntax == ONIG_SYNTAX_GREP) {
  518. c = 'g';
  519. } else if (syntax == ONIG_SYNTAX_EMACS) {
  520. c = 'c';
  521. } else if (syntax == ONIG_SYNTAX_RUBY) {
  522. c = 'r';
  523. } else if (syntax == ONIG_SYNTAX_PERL) {
  524. c = 'z';
  525. } else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
  526. c = 'b';
  527. } else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
  528. c = 'd';
  529. }
  530. if (c != 0) {
  531. if (len_left > 0) {
  532. --len_left;
  533. *(p++) = c;
  534. }
  535. ++len_req;
  536. }
  537. if (len_left > 0) {
  538. --len_left;
  539. *(p++) = '\0';
  540. }
  541. ++len_req;
  542. if (len < len_req) {
  543. return len_req;
  544. }
  545. return 0;
  546. }
  547. /* }}} */
  548. /* {{{ _php_mb_regex_init_options */
  549. static void
  550. _php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval)
  551. {
  552. size_t n;
  553. char c;
  554. OnigOptionType optm = 0;
  555. *syntax = ONIG_SYNTAX_RUBY;
  556. if (parg != NULL) {
  557. n = 0;
  558. while(n < narg) {
  559. c = parg[n++];
  560. switch (c) {
  561. case 'i':
  562. optm |= ONIG_OPTION_IGNORECASE;
  563. break;
  564. case 'x':
  565. optm |= ONIG_OPTION_EXTEND;
  566. break;
  567. case 'm':
  568. optm |= ONIG_OPTION_MULTILINE;
  569. break;
  570. case 's':
  571. optm |= ONIG_OPTION_SINGLELINE;
  572. break;
  573. case 'p':
  574. optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
  575. break;
  576. case 'l':
  577. optm |= ONIG_OPTION_FIND_LONGEST;
  578. break;
  579. case 'n':
  580. optm |= ONIG_OPTION_FIND_NOT_EMPTY;
  581. break;
  582. case 'j':
  583. *syntax = ONIG_SYNTAX_JAVA;
  584. break;
  585. case 'u':
  586. *syntax = ONIG_SYNTAX_GNU_REGEX;
  587. break;
  588. case 'g':
  589. *syntax = ONIG_SYNTAX_GREP;
  590. break;
  591. case 'c':
  592. *syntax = ONIG_SYNTAX_EMACS;
  593. break;
  594. case 'r':
  595. *syntax = ONIG_SYNTAX_RUBY;
  596. break;
  597. case 'z':
  598. *syntax = ONIG_SYNTAX_PERL;
  599. break;
  600. case 'b':
  601. *syntax = ONIG_SYNTAX_POSIX_BASIC;
  602. break;
  603. case 'd':
  604. *syntax = ONIG_SYNTAX_POSIX_EXTENDED;
  605. break;
  606. case 'e':
  607. if (eval != NULL) *eval = 1;
  608. break;
  609. default:
  610. break;
  611. }
  612. }
  613. if (option != NULL) *option|=optm;
  614. }
  615. }
  616. /* }}} */
  617. /*
  618. * Callbacks for named subpatterns
  619. */
  620. /* {{{ struct mb_regex_groups_iter_args */
  /* Context handed to mb_regex_groups_iter() through onig_foreach_name(). */
  typedef struct mb_regex_groups_iter_args {
  /* Array that receives one entry per named group. */
  zval *groups;
  /* Subject string the region offsets refer to, and its length in bytes. */
  char *search_str;
  size_t search_len;
  /* Match region produced by the search. */
  OnigRegion *region;
  } mb_regex_groups_iter_args;
  627. /* }}} */
  628. /* {{{ mb_regex_groups_iter */
  629. static int
  630. mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
  631. {
  632. mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
  633. int gn, beg, end;
  634. /*
  635. * In case of duplicate groups, keep only the last succeeding one
  636. * to be consistent with preg_match with the PCRE_DUPNAMES option.
  637. */
  638. gn = onig_name_to_backref_number(reg, name, name_end, args->region);
  639. beg = args->region->beg[gn];
  640. end = args->region->end[gn];
  641. if (beg >= 0 && beg < end && end <= args->search_len) {
  642. add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
  643. } else {
  644. add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
  645. }
  646. return 0;
  647. }
  648. /* }}} */
  649. /*
  650. * Helper for _php_mb_regex_ereg_replace_exec
  651. */
  652. /* {{{ mb_regex_substitute */
  653. static inline void mb_regex_substitute(
  654. smart_str *pbuf,
  655. const char *subject,
  656. size_t subject_len,
  657. char *replace,
  658. size_t replace_len,
  659. php_mb_regex_t *regexp,
  660. OnigRegion *regs,
  661. const mbfl_encoding *enc
  662. ) {
  663. char *p, *sp, *eos;
  664. int no; /* bakreference group number */
  665. int clen; /* byte-length of the current character */
  666. p = replace;
  667. eos = replace + replace_len;
  668. while (p < eos) {
  669. clen = (int) php_mb_mbchar_bytes_ex(p, enc);
  670. if (clen != 1 || p == eos || p[0] != '\\') {
  671. /* skip anything that's not an ascii backslash */
  672. smart_str_appendl(pbuf, p, clen);
  673. p += clen;
  674. continue;
  675. }
  676. sp = p; /* save position */
  677. clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
  678. if (clen != 1 || p == eos) {
  679. /* skip backslash followed by multibyte char */
  680. smart_str_appendl(pbuf, sp, p - sp);
  681. continue;
  682. }
  683. no = -1;
  684. switch (p[0]) {
  685. case '0':
  686. no = 0;
  687. p++;
  688. break;
  689. case '1': case '2': case '3': case '4':
  690. case '5': case '6': case '7': case '8': case '9':
  691. if (!onig_noname_group_capture_is_active(regexp)) {
  692. /*
  693. * FIXME:
  694. * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
  695. * For now we just ignore them, but in the future we might want to raise a warning
  696. * and abort the whole replace operation.
  697. */
  698. p++;
  699. smart_str_appendl(pbuf, sp, p - sp);
  700. continue;
  701. }
  702. no = p[0] - '0';
  703. p++;
  704. break;
  705. case 'k':
  706. {
  707. clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
  708. if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
  709. /* not a backref delimiter */
  710. p += clen;
  711. smart_str_appendl(pbuf, sp, p - sp);
  712. continue;
  713. }
  714. /* try to consume everything until next delimiter */
  715. char delim = p[0] == '<' ? '>' : '\'';
  716. char *name, *name_end;
  717. char maybe_num = 1;
  718. name_end = name = p + 1;
  719. while (name_end < eos) {
  720. clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
  721. if (clen != 1) {
  722. name_end += clen;
  723. maybe_num = 0;
  724. continue;
  725. }
  726. if (name_end[0] == delim) break;
  727. if (maybe_num && !isdigit(name_end[0])) maybe_num = 0;
  728. name_end++;
  729. }
  730. p = name_end + 1;
  731. if (name_end - name < 1 || name_end >= eos) {
  732. /* the backref was empty or we failed to find the end delimiter */
  733. smart_str_appendl(pbuf, sp, p - sp);
  734. continue;
  735. }
  736. /* we have either a name or a number */
  737. if (maybe_num) {
  738. if (!onig_noname_group_capture_is_active(regexp)) {
  739. /* see above note on mixing numbered & named backrefs */
  740. smart_str_appendl(pbuf, sp, p - sp);
  741. continue;
  742. }
  743. if (name_end - name == 1) {
  744. no = name[0] - '0';
  745. break;
  746. }
  747. if (name[0] == '0') {
  748. /* 01 is not a valid number */
  749. break;
  750. }
  751. no = (int) strtoul(name, NULL, 10);
  752. break;
  753. }
  754. no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
  755. break;
  756. }
  757. default:
  758. /* We're not treating \ as an escape character and will interpret something like
  759. * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
  760. * function has not supported escaping of backslashes historically. */
  761. smart_str_appendl(pbuf, sp, p - sp);
  762. continue;
  763. }
  764. if (no < 0 || no >= regs->num_regs) {
  765. /* invalid group number reference, keep the escape sequence in the output */
  766. smart_str_appendl(pbuf, sp, p - sp);
  767. continue;
  768. }
  769. if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
  770. smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
  771. }
  772. }
  773. if (p < eos) {
  774. smart_str_appendl(pbuf, p, eos - p);
  775. }
  776. }
  777. /* }}} */
  778. /*
  779. * php functions
  780. */
  781. /* {{{ proto string mb_regex_encoding([string encoding])
  782. Returns the current encoding for regex as a string. */
  783. PHP_FUNCTION(mb_regex_encoding)
  784. {
  785. char *encoding = NULL;
  786. size_t encoding_len;
  787. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s", &encoding, &encoding_len) == FAILURE) {
  788. RETURN_THROWS();
  789. }
  790. if (!encoding) {
  791. const char *retval = php_mb_regex_get_mbctype();
  792. ZEND_ASSERT(retval != NULL);
  793. RETURN_STRING(retval);
  794. } else {
  795. if (php_mb_regex_set_mbctype(encoding) == FAILURE) {
  796. zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", encoding);
  797. RETURN_THROWS();
  798. }
  799. /* TODO Make function return previous encoding? */
  800. RETURN_TRUE;
  801. }
  802. }
  803. /* }}} */
  804. /* {{{ _php_mb_onig_search */
  805. static int _php_mb_onig_search(regex_t* reg, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start,
  806. const OnigUChar* range, OnigRegion* region, OnigOptionType option) {
  807. OnigMatchParam *mp = onig_new_match_param();
  808. int err;
  809. onig_initialize_match_param(mp);
  810. if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_stack_limit))) {
  811. onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
  812. }
  813. if (!ZEND_LONG_UINT_OVFL(MBSTRG(regex_retry_limit))) {
  814. onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
  815. }
  816. /* search */
  817. err = onig_search_with_param(reg, str, end, start, range, region, option, mp);
  818. onig_free_match_param(mp);
  819. return err;
  820. }
  821. /* }}} */
  822. /* {{{ _php_mb_regex_ereg_exec */
  823. static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
  824. {
  825. zval *array = NULL;
  826. char *arg_pattern, *string;
  827. size_t arg_pattern_len, string_len;
  828. php_mb_regex_t *re;
  829. OnigRegion *regs = NULL;
  830. int i, match_len, beg, end;
  831. OnigOptionType options;
  832. char *str;
  833. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|z", &arg_pattern, &arg_pattern_len, &string, &string_len, &array) == FAILURE) {
  834. RETURN_THROWS();
  835. }
  836. if (array != NULL) {
  837. array = zend_try_array_init(array);
  838. if (!array) {
  839. RETURN_THROWS();
  840. }
  841. }
  842. if (!php_mb_check_encoding(
  843. string,
  844. string_len,
  845. php_mb_regex_get_mbctype_encoding()
  846. )) {
  847. RETURN_FALSE;
  848. }
  849. options = MBREX(regex_default_options);
  850. if (icase) {
  851. options |= ONIG_OPTION_IGNORECASE;
  852. }
  853. if (arg_pattern_len == 0) {
  854. php_error_docref(NULL, E_WARNING, "Empty pattern");
  855. RETVAL_FALSE;
  856. goto out;
  857. }
  858. re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(regex_default_syntax));
  859. if (re == NULL) {
  860. RETVAL_FALSE;
  861. goto out;
  862. }
  863. regs = onig_region_new();
  864. /* actually execute the regular expression */
  865. if (_php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, (OnigUChar *)(string + string_len), regs, 0) < 0) {
  866. RETVAL_FALSE;
  867. goto out;
  868. }
  869. match_len = 1;
  870. str = string;
  871. if (array != NULL) {
  872. match_len = regs->end[0] - regs->beg[0];
  873. for (i = 0; i < regs->num_regs; i++) {
  874. beg = regs->beg[i];
  875. end = regs->end[i];
  876. if (beg >= 0 && beg < end && (size_t)end <= string_len) {
  877. add_index_stringl(array, i, (char *)&str[beg], end - beg);
  878. } else {
  879. add_index_bool(array, i, 0);
  880. }
  881. }
  882. if (onig_number_of_names(re) > 0) {
  883. mb_regex_groups_iter_args args = {array, string, string_len, regs};
  884. onig_foreach_name(re, mb_regex_groups_iter, &args);
  885. }
  886. }
  887. if (match_len == 0) {
  888. match_len = 1;
  889. }
  890. RETVAL_LONG(match_len);
  891. out:
  892. if (regs != NULL) {
  893. onig_region_free(regs, 1);
  894. }
  895. }
  896. /* }}} */
  897. /* {{{ proto int mb_ereg(string pattern, string string [, array registers])
  898. Regular expression match for multibyte string */
  899. PHP_FUNCTION(mb_ereg)
  900. {
  901. _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
  902. }
  903. /* }}} */
  904. /* {{{ proto int mb_eregi(string pattern, string string [, array registers])
  905. Case-insensitive regular expression match for multibyte string */
  906. PHP_FUNCTION(mb_eregi)
  907. {
  908. _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  909. }
  910. /* }}} */
  911. /* {{{ _php_mb_regex_ereg_replace_exec */
  912. static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable)
  913. {
  914. char *arg_pattern;
  915. size_t arg_pattern_len;
  916. char *replace;
  917. size_t replace_len;
  918. zend_fcall_info arg_replace_fci;
  919. zend_fcall_info_cache arg_replace_fci_cache;
  920. char *string;
  921. size_t string_len;
  922. php_mb_regex_t *re;
  923. OnigSyntaxType *syntax;
  924. OnigRegion *regs = NULL;
  925. smart_str out_buf = {0};
  926. smart_str eval_buf = {0};
  927. smart_str *pbuf;
  928. int err, eval, n;
  929. OnigUChar *pos;
  930. OnigUChar *string_lim;
  931. char *description = NULL;
  932. const mbfl_encoding *enc = php_mb_regex_get_mbctype_encoding();
  933. ZEND_ASSERT(enc != NULL);
  934. eval = 0;
  935. {
  936. char *option_str = NULL;
  937. size_t option_str_len = 0;
  938. if (!is_callable) {
  939. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sss|s",
  940. &arg_pattern, &arg_pattern_len,
  941. &replace, &replace_len,
  942. &string, &string_len,
  943. &option_str, &option_str_len) == FAILURE) {
  944. RETURN_THROWS();
  945. }
  946. } else {
  947. if (zend_parse_parameters(ZEND_NUM_ARGS(), "sfs|s",
  948. &arg_pattern, &arg_pattern_len,
  949. &arg_replace_fci, &arg_replace_fci_cache,
  950. &string, &string_len,
  951. &option_str, &option_str_len) == FAILURE) {
  952. RETURN_THROWS();
  953. }
  954. }
  955. if (!php_mb_check_encoding(string, string_len, enc)) {
  956. RETURN_NULL();
  957. }
  958. if (option_str != NULL) {
  959. _php_mb_regex_init_options(option_str, option_str_len, &options, &syntax, &eval);
  960. } else {
  961. options |= MBREX(regex_default_options);
  962. syntax = MBREX(regex_default_syntax);
  963. }
  964. }
  965. if (eval) {
  966. if (is_callable) {
  967. php_error_docref(NULL, E_WARNING, "Option 'e' cannot be used with replacement callback");
  968. } else {
  969. php_error_docref(NULL, E_WARNING, "The 'e' option is no longer supported, use mb_ereg_replace_callback instead");
  970. }
  971. RETURN_FALSE;
  972. }
  973. /* create regex pattern buffer */
  974. re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, syntax);
  975. if (re == NULL) {
  976. RETURN_FALSE;
  977. }
  978. if (is_callable) {
  979. pbuf = &eval_buf;
  980. description = zend_make_compiled_string_description("mbregex replace");
  981. } else {
  982. pbuf = &out_buf;
  983. description = NULL;
  984. }
  985. /* do the actual work */
  986. err = 0;
  987. pos = (OnigUChar *)string;
  988. string_lim = (OnigUChar*)(string + string_len);
  989. regs = onig_region_new();
  990. while (err >= 0) {
  991. err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0);
  992. if (err <= -2) {
  993. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  994. onig_error_code_to_str(err_str, err);
  995. php_error_docref(NULL, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str);
  996. break;
  997. }
  998. if (err >= 0) {
  999. /* copy the part of the string before the match */
  1000. smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
  1001. if (!is_callable) {
  1002. mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
  1003. }
  1004. if (is_callable) {
  1005. zval args[1];
  1006. zval subpats, retval;
  1007. int i;
  1008. array_init(&subpats);
  1009. for (i = 0; i < regs->num_regs; i++) {
  1010. add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
  1011. }
  1012. if (onig_number_of_names(re) > 0) {
  1013. mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
  1014. onig_foreach_name(re, mb_regex_groups_iter, &args);
  1015. }
  1016. ZVAL_COPY_VALUE(&args[0], &subpats);
  1017. /* null terminate buffer */
  1018. smart_str_0(&eval_buf);
  1019. arg_replace_fci.param_count = 1;
  1020. arg_replace_fci.params = args;
  1021. arg_replace_fci.retval = &retval;
  1022. if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache) == SUCCESS &&
  1023. !Z_ISUNDEF(retval)) {
  1024. convert_to_string_ex(&retval);
  1025. smart_str_appendl(&out_buf, Z_STRVAL(retval), Z_STRLEN(retval));
  1026. smart_str_free(&eval_buf);
  1027. zval_ptr_dtor(&retval);
  1028. } else {
  1029. if (!EG(exception)) {
  1030. php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
  1031. }
  1032. }
  1033. zval_ptr_dtor(&subpats);
  1034. }
  1035. n = regs->end[0];
  1036. if ((pos - (OnigUChar *)string) < n) {
  1037. pos = (OnigUChar *)string + n;
  1038. } else {
  1039. if (pos < string_lim) {
  1040. smart_str_appendl(&out_buf, (char *)pos, 1);
  1041. }
  1042. pos++;
  1043. }
  1044. } else { /* nomatch */
  1045. /* stick that last bit of string on our output */
  1046. if (string_lim - pos > 0) {
  1047. smart_str_appendl(&out_buf, (char *)pos, string_lim - pos);
  1048. }
  1049. }
  1050. onig_region_free(regs, 0);
  1051. }
  1052. if (description) {
  1053. efree(description);
  1054. }
  1055. if (regs != NULL) {
  1056. onig_region_free(regs, 1);
  1057. }
  1058. smart_str_free(&eval_buf);
  1059. if (err <= -2) {
  1060. smart_str_free(&out_buf);
  1061. RETVAL_FALSE;
  1062. } else if (out_buf.s) {
  1063. smart_str_0(&out_buf);
  1064. RETVAL_STR(out_buf.s);
  1065. } else {
  1066. RETVAL_EMPTY_STRING();
  1067. }
  1068. }
  1069. /* }}} */
  1070. /* {{{ proto string mb_ereg_replace(string pattern, string replacement, string string [, string option])
  1071. Replace regular expression for multibyte string */
  1072. PHP_FUNCTION(mb_ereg_replace)
  1073. {
  1074. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
  1075. }
  1076. /* }}} */
  1077. /* {{{ proto string mb_eregi_replace(string pattern, string replacement, string string [, string option])
  1078. Case insensitive replace regular expression for multibyte string */
  1079. PHP_FUNCTION(mb_eregi_replace)
  1080. {
  1081. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0);
  1082. }
  1083. /* }}} */
  1084. /* {{{ proto string mb_ereg_replace_callback(string pattern, callable callback, string string [, string option])
  1085. Replace regular expression for multibyte string using a replacement callback */
  1086. PHP_FUNCTION(mb_ereg_replace_callback)
  1087. {
  1088. _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
  1089. }
  1090. /* }}} */
  1091. /* {{{ proto array mb_split(string pattern, string string [, int limit])
  1092. split multibyte string into array by regular expression */
  1093. PHP_FUNCTION(mb_split)
  1094. {
  1095. char *arg_pattern;
  1096. size_t arg_pattern_len;
  1097. php_mb_regex_t *re;
  1098. OnigRegion *regs = NULL;
  1099. char *string;
  1100. OnigUChar *pos, *chunk_pos;
  1101. size_t string_len;
  1102. int err;
  1103. zend_long count = -1;
  1104. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
  1105. RETURN_THROWS();
  1106. }
  1107. if (count > 0) {
  1108. count--;
  1109. }
  1110. if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
  1111. RETURN_FALSE;
  1112. }
  1113. /* create regex pattern buffer */
  1114. if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(regex_default_syntax))) == NULL) {
  1115. RETURN_FALSE;
  1116. }
  1117. array_init(return_value);
  1118. chunk_pos = pos = (OnigUChar *)string;
  1119. err = 0;
  1120. regs = onig_region_new();
  1121. /* churn through str, generating array entries as we go */
  1122. while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
  1123. size_t beg, end;
  1124. err = _php_mb_onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
  1125. if (err < 0) {
  1126. break;
  1127. }
  1128. beg = regs->beg[0], end = regs->end[0];
  1129. /* add it to the array */
  1130. if ((size_t)(pos - (OnigUChar *)string) < end) {
  1131. if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
  1132. add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
  1133. --count;
  1134. } else {
  1135. err = -2;
  1136. break;
  1137. }
  1138. /* point at our new starting point */
  1139. chunk_pos = pos = (OnigUChar *)string + end;
  1140. } else {
  1141. pos++;
  1142. }
  1143. onig_region_free(regs, 0);
  1144. }
  1145. onig_region_free(regs, 1);
  1146. /* see if we encountered an error */
  1147. if (err <= -2) {
  1148. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  1149. onig_error_code_to_str(err_str, err);
  1150. php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
  1151. zend_array_destroy(Z_ARR_P(return_value));
  1152. RETURN_FALSE;
  1153. }
  1154. /* otherwise we just have one last element to add to the array */
  1155. if ((OnigUChar *)(string + string_len) > chunk_pos) {
  1156. size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
  1157. add_next_index_stringl(return_value, (char *)chunk_pos, n);
  1158. } else {
  1159. add_next_index_stringl(return_value, "", 0);
  1160. }
  1161. }
  1162. /* }}} */
  1163. /* {{{ proto bool mb_ereg_match(string pattern, string string [,string option])
  1164. Regular expression match for multibyte string */
  1165. PHP_FUNCTION(mb_ereg_match)
  1166. {
  1167. char *arg_pattern;
  1168. size_t arg_pattern_len;
  1169. char *string;
  1170. size_t string_len;
  1171. php_mb_regex_t *re;
  1172. OnigSyntaxType *syntax;
  1173. OnigOptionType option = 0;
  1174. int err;
  1175. OnigMatchParam *mp;
  1176. {
  1177. char *option_str = NULL;
  1178. size_t option_str_len = 0;
  1179. if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|s",
  1180. &arg_pattern, &arg_pattern_len, &string, &string_len,
  1181. &option_str, &option_str_len)==FAILURE) {
  1182. RETURN_THROWS();
  1183. }
  1184. if (option_str != NULL) {
  1185. _php_mb_regex_init_options(option_str, option_str_len, &option, &syntax, NULL);
  1186. } else {
  1187. option |= MBREX(regex_default_options);
  1188. syntax = MBREX(regex_default_syntax);
  1189. }
  1190. }
  1191. if (!php_mb_check_encoding(string, string_len, php_mb_regex_get_mbctype_encoding())) {
  1192. RETURN_FALSE;
  1193. }
  1194. if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
  1195. RETURN_FALSE;
  1196. }
  1197. mp = onig_new_match_param();
  1198. onig_initialize_match_param(mp);
  1199. if (MBSTRG(regex_stack_limit) > 0 && MBSTRG(regex_stack_limit) < UINT_MAX) {
  1200. onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int)MBSTRG(regex_stack_limit));
  1201. }
  1202. if (MBSTRG(regex_retry_limit) > 0 && MBSTRG(regex_retry_limit) < UINT_MAX) {
  1203. onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int)MBSTRG(regex_retry_limit));
  1204. }
  1205. /* match */
  1206. err = onig_match_with_param(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0, mp);
  1207. onig_free_match_param(mp);
  1208. if (err >= 0) {
  1209. RETVAL_TRUE;
  1210. } else {
  1211. RETVAL_FALSE;
  1212. }
  1213. }
  1214. /* }}} */
  1215. /* regex search */
  1216. /* {{{ _php_mb_regex_ereg_search_exec */
  1217. static void
  1218. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
  1219. {
  1220. char *arg_pattern = NULL, *arg_options = NULL;
  1221. size_t arg_pattern_len, arg_options_len;
  1222. int err;
  1223. size_t n, i, pos, len;
  1224. /* Stored as int* in the OnigRegion struct */
  1225. int beg, end;
  1226. OnigOptionType option;
  1227. OnigUChar *str;
  1228. OnigSyntaxType *syntax;
  1229. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|ss", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
  1230. RETURN_THROWS();
  1231. }
  1232. option = MBREX(regex_default_options);
  1233. if (arg_options) {
  1234. option = 0;
  1235. _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
  1236. }
  1237. if (MBREX(search_regs)) {
  1238. onig_region_free(MBREX(search_regs), 1);
  1239. MBREX(search_regs) = NULL;
  1240. }
  1241. if (arg_pattern) {
  1242. /* create regex pattern buffer */
  1243. if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(regex_default_syntax))) == NULL) {
  1244. RETURN_FALSE;
  1245. }
  1246. }
  1247. pos = MBREX(search_pos);
  1248. str = NULL;
  1249. len = 0;
  1250. if (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING){
  1251. str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
  1252. len = Z_STRLEN(MBREX(search_str));
  1253. }
  1254. if (MBREX(search_re) == NULL) {
  1255. php_error_docref(NULL, E_WARNING, "No regex given");
  1256. RETURN_FALSE;
  1257. }
  1258. if (str == NULL) {
  1259. php_error_docref(NULL, E_WARNING, "No string given");
  1260. RETURN_FALSE;
  1261. }
  1262. MBREX(search_regs) = onig_region_new();
  1263. err = _php_mb_onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0);
  1264. if (err == ONIG_MISMATCH) {
  1265. MBREX(search_pos) = len;
  1266. RETVAL_FALSE;
  1267. } else if (err <= -2) {
  1268. OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
  1269. onig_error_code_to_str(err_str, err);
  1270. php_error_docref(NULL, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str);
  1271. RETVAL_FALSE;
  1272. } else {
  1273. switch (mode) {
  1274. case 1:
  1275. array_init(return_value);
  1276. beg = MBREX(search_regs)->beg[0];
  1277. end = MBREX(search_regs)->end[0];
  1278. add_next_index_long(return_value, beg);
  1279. add_next_index_long(return_value, end - beg);
  1280. break;
  1281. case 2:
  1282. array_init(return_value);
  1283. n = MBREX(search_regs)->num_regs;
  1284. for (i = 0; i < n; i++) {
  1285. beg = MBREX(search_regs)->beg[i];
  1286. end = MBREX(search_regs)->end[i];
  1287. if (beg >= 0 && beg <= end && end <= len) {
  1288. add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
  1289. } else {
  1290. add_index_bool(return_value, i, 0);
  1291. }
  1292. }
  1293. if (onig_number_of_names(MBREX(search_re)) > 0) {
  1294. mb_regex_groups_iter_args args = {
  1295. return_value,
  1296. Z_STRVAL(MBREX(search_str)),
  1297. Z_STRLEN(MBREX(search_str)),
  1298. MBREX(search_regs)
  1299. };
  1300. onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
  1301. }
  1302. break;
  1303. default:
  1304. RETVAL_TRUE;
  1305. break;
  1306. }
  1307. end = MBREX(search_regs)->end[0];
  1308. if (pos <= end) {
  1309. MBREX(search_pos) = end;
  1310. } else {
  1311. MBREX(search_pos) = pos + 1;
  1312. }
  1313. }
  1314. if (err < 0) {
  1315. onig_region_free(MBREX(search_regs), 1);
  1316. MBREX(search_regs) = (OnigRegion *)NULL;
  1317. }
  1318. }
  1319. /* }}} */
  1320. /* {{{ proto bool mb_ereg_search([string pattern[, string option]])
  1321. Regular expression search for multibyte string */
  1322. PHP_FUNCTION(mb_ereg_search)
  1323. {
  1324. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
  1325. }
  1326. /* }}} */
  1327. /* {{{ proto array mb_ereg_search_pos([string pattern[, string option]])
  1328. Regular expression search for multibyte string, returning the start and length of the match */
  1329. PHP_FUNCTION(mb_ereg_search_pos)
  1330. {
  1331. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  1332. }
  1333. /* }}} */
  1334. /* {{{ proto array mb_ereg_search_regs([string pattern[, string option]])
  1335. Regular expression search for multibyte string, returning the matched substrings */
  1336. PHP_FUNCTION(mb_ereg_search_regs)
  1337. {
  1338. _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2);
  1339. }
  1340. /* }}} */
  1341. /* {{{ proto bool mb_ereg_search_init(string string [, string pattern[, string option]])
  1342. Initialize string and regular expression for search. */
  1343. PHP_FUNCTION(mb_ereg_search_init)
  1344. {
  1345. zend_string *arg_str;
  1346. char *arg_pattern = NULL, *arg_options = NULL;
  1347. size_t arg_pattern_len = 0, arg_options_len = 0;
  1348. OnigSyntaxType *syntax = NULL;
  1349. OnigOptionType option;
  1350. if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|ss", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) {
  1351. RETURN_THROWS();
  1352. }
  1353. if (ZEND_NUM_ARGS() > 1 && arg_pattern_len == 0) {
  1354. php_error_docref(NULL, E_WARNING, "Empty pattern");
  1355. RETURN_FALSE;
  1356. }
  1357. option = MBREX(regex_default_options);
  1358. syntax = MBREX(regex_default_syntax);
  1359. if (ZEND_NUM_ARGS() == 3) {
  1360. option = 0;
  1361. _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL);
  1362. }
  1363. if (ZEND_NUM_ARGS() > 1) {
  1364. /* create regex pattern buffer */
  1365. if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, syntax)) == NULL) {
  1366. RETURN_FALSE;
  1367. }
  1368. }
  1369. if (!Z_ISNULL(MBREX(search_str))) {
  1370. zval_ptr_dtor(&MBREX(search_str));
  1371. }
  1372. ZVAL_STR_COPY(&MBREX(search_str), arg_str);
  1373. if (php_mb_check_encoding(
  1374. ZSTR_VAL(arg_str),
  1375. ZSTR_LEN(arg_str),
  1376. php_mb_regex_get_mbctype_encoding()
  1377. )) {
  1378. MBREX(search_pos) = 0;
  1379. RETVAL_TRUE;
  1380. } else {
  1381. MBREX(search_pos) = ZSTR_LEN(arg_str);
  1382. RETVAL_FALSE;
  1383. }
  1384. if (MBREX(search_regs) != NULL) {
  1385. onig_region_free(MBREX(search_regs), 1);
  1386. MBREX(search_regs) = NULL;
  1387. }
  1388. }
  1389. /* }}} */
  1390. /* {{{ proto array mb_ereg_search_getregs(void)
  1391. Get the matched substrings of the last search */
  1392. PHP_FUNCTION(mb_ereg_search_getregs)
  1393. {
  1394. size_t n, i, len;
  1395. /* Stored as int* in the OnigRegion struct */
  1396. int beg, end;
  1397. OnigUChar *str;
  1398. if (zend_parse_parameters_none() == FAILURE) {
  1399. RETURN_THROWS();
  1400. }
  1401. if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
  1402. array_init(return_value);
  1403. str = (OnigUChar *)Z_STRVAL(MBREX(search_str));
  1404. len = Z_STRLEN(MBREX(search_str));
  1405. n = MBREX(search_regs)->num_regs;
  1406. for (i = 0; i < n; i++) {
  1407. beg = MBREX(search_regs)->beg[i];
  1408. end = MBREX(search_regs)->end[i];
  1409. if (beg >= 0 && beg <= end && end <= len) {
  1410. add_index_stringl(return_value, i, (char *)&str[beg], end - beg);
  1411. } else {
  1412. add_index_bool(return_value, i, 0);
  1413. }
  1414. }
  1415. if (onig_number_of_names(MBREX(search_re)) > 0) {
  1416. mb_regex_groups_iter_args args = {
  1417. return_value,
  1418. Z_STRVAL(MBREX(search_str)),
  1419. len,
  1420. MBREX(search_regs)
  1421. };
  1422. onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
  1423. }
  1424. } else {
  1425. RETVAL_FALSE;
  1426. }
  1427. }
  1428. /* }}} */
  1429. /* {{{ proto int mb_ereg_search_getpos(void)
  1430. Get search start position */
  1431. PHP_FUNCTION(mb_ereg_search_getpos)
  1432. {
  1433. if (zend_parse_parameters_none() == FAILURE) {
  1434. RETURN_THROWS();
  1435. }
  1436. RETVAL_LONG(MBREX(search_pos));
  1437. }
  1438. /* }}} */
  1439. /* {{{ proto bool mb_ereg_search_setpos(int position)
  1440. Set search start position */
  1441. PHP_FUNCTION(mb_ereg_search_setpos)
  1442. {
  1443. zend_long position;
  1444. if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &position) == FAILURE) {
  1445. RETURN_THROWS();
  1446. }
  1447. /* Accept negative position if length of search string can be determined */
  1448. if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
  1449. position += Z_STRLEN(MBREX(search_str));
  1450. }
  1451. if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
  1452. php_error_docref(NULL, E_WARNING, "Position is out of range");
  1453. MBREX(search_pos) = 0;
  1454. RETURN_FALSE;
  1455. }
  1456. MBREX(search_pos) = position;
  1457. RETURN_TRUE;
  1458. }
  1459. /* }}} */
  1460. /* {{{ _php_mb_regex_set_options */
  1461. static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax)
  1462. {
  1463. if (prev_options != NULL) {
  1464. *prev_options = MBREX(regex_default_options);
  1465. }
  1466. if (prev_syntax != NULL) {
  1467. *prev_syntax = MBREX(regex_default_syntax);
  1468. }
  1469. MBREX(regex_default_options) = options;
  1470. MBREX(regex_default_syntax) = syntax;
  1471. }
  1472. /* }}} */
  1473. /* {{{ proto string mb_regex_set_options([string options])
  1474. Set or get the default options for mbregex functions */
  1475. PHP_FUNCTION(mb_regex_set_options)
  1476. {
  1477. OnigOptionType opt, prev_opt;
  1478. OnigSyntaxType *syntax, *prev_syntax;
  1479. char *string = NULL;
  1480. size_t string_len;
  1481. char buf[16];
  1482. if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s",
  1483. &string, &string_len) == FAILURE) {
  1484. RETURN_THROWS();
  1485. }
  1486. if (string != NULL) {
  1487. opt = 0;
  1488. syntax = NULL;
  1489. _php_mb_regex_init_options(string, string_len, &opt, &syntax, NULL);
  1490. _php_mb_regex_set_options(opt, syntax, &prev_opt, &prev_syntax);
  1491. opt = prev_opt;
  1492. syntax = prev_syntax;
  1493. } else {
  1494. opt = MBREX(regex_default_options);
  1495. syntax = MBREX(regex_default_syntax);
  1496. }
  1497. _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
  1498. RETVAL_STRING(buf);
  1499. }
  1500. /* }}} */
  1501. #endif /* HAVE_MBREGEX */