PageRenderTime 49ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/php5/ext/intl/grapheme/grapheme_string.c

http://github.com/vpj/PHP-Extension-API
C | 914 lines | 540 code | 246 blank | 128 comment | 126 complexity | 80c8db1257c49d0e18efd903aaa5886c MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1, BSD-3-Clause
  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 5 |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | http://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Ed Batutis <ed@batutis.com> |
  14. +----------------------------------------------------------------------+
  15. */
  16. /* {{{ includes */
  17. #ifdef HAVE_CONFIG_H
  18. #include "config.h"
  19. #endif
  20. #include <php.h>
  21. #include "grapheme.h"
  22. #include "grapheme_util.h"
  23. #include <unicode/utypes.h>
  24. #include <unicode/ucol.h>
  25. #include <unicode/ustring.h>
  26. #include <unicode/ubrk.h>
  27. #include "ext/standard/php_string.h"
  28. /* }}} */
  /* Extraction modes accepted by grapheme_extract(). The numeric values are
   * what grapheme_register_constants() exports as GRAPHEME_EXTR_*, and they
   * double as indexes into the iterator dispatch table, so they must stay
   * dense and start at zero. MIN/MAX bracket the valid range. */
  enum {
      GRAPHEME_EXTRACT_TYPE_COUNT    = 0,
      GRAPHEME_EXTRACT_TYPE_MAXBYTES = 1,
      GRAPHEME_EXTRACT_TYPE_MAXCHARS = 2,
      GRAPHEME_EXTRACT_TYPE_MIN      = GRAPHEME_EXTRACT_TYPE_COUNT,
      GRAPHEME_EXTRACT_TYPE_MAX      = GRAPHEME_EXTRACT_TYPE_MAXCHARS
  };
  34. /* {{{ grapheme_register_constants
  35. * Register API constants
  36. */
  37. void grapheme_register_constants( INIT_FUNC_ARGS )
  38. {
  39. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
  40. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
  41. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
  42. }
  43. /* }}} */
  44. /* {{{ proto int grapheme_strlen(string str)
  45. Get number of graphemes in a string */
  46. PHP_FUNCTION(grapheme_strlen)
  47. {
  48. unsigned char* string;
  49. int string_len;
  50. UChar* ustring = NULL;
  51. int ustring_len = 0;
  52. int ret_len;
  53. UErrorCode status;
  54. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
  55. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  56. "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
  57. RETURN_FALSE;
  58. }
  59. ret_len = grapheme_ascii_check(string, string_len);
  60. if ( ret_len >= 0 )
  61. RETURN_LONG(ret_len);
  62. /* convert the string to UTF-16. */
  63. status = U_ZERO_ERROR;
  64. intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
  65. if ( U_FAILURE( status ) ) {
  66. /* Set global error code. */
  67. intl_error_set_code( NULL, status TSRMLS_CC );
  68. /* Set error messages. */
  69. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC );
  70. efree( ustring );
  71. RETURN_NULL();
  72. }
  73. ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
  74. efree( ustring );
  75. if (ret_len >= 0) {
  76. RETVAL_LONG(ret_len);
  77. } else {
  78. RETVAL_FALSE;
  79. }
  80. }
  81. /* }}} */
  82. /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
  83. Find position of first occurrence of a string within another */
  84. PHP_FUNCTION(grapheme_strpos)
  85. {
  86. unsigned char *haystack, *needle;
  87. int haystack_len, needle_len;
  88. unsigned char *found;
  89. long loffset = 0;
  90. int32_t offset = 0;
  91. int ret_pos, uchar_pos;
  92. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  93. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  94. "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
  95. RETURN_FALSE;
  96. }
  97. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  98. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  99. RETURN_FALSE;
  100. }
  101. /* we checked that it will fit: */
  102. offset = (int32_t) loffset;
  103. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  104. if (needle_len == 0) {
  105. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  106. RETURN_FALSE;
  107. }
  108. /* quick check to see if the string might be there
  109. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  110. */
  111. found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
  112. /* if it isn't there the we are done */
  113. if (!found) {
  114. RETURN_FALSE;
  115. }
  116. /* if it is there, and if the haystack is ascii, we are all done */
  117. if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
  118. RETURN_LONG(found - haystack);
  119. }
  120. /* do utf16 part of the strpos */
  121. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC );
  122. if ( ret_pos >= 0 ) {
  123. RETURN_LONG(ret_pos + offset);
  124. } else {
  125. RETURN_FALSE;
  126. }
  127. }
  128. /* }}} */
  129. /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
  130. Find position of first occurrence of a string within another, ignoring case differences */
  131. PHP_FUNCTION(grapheme_stripos)
  132. {
  133. unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
  134. int haystack_len, needle_len;
  135. unsigned char *found;
  136. long loffset = 0;
  137. int32_t offset = 0;
  138. int ret_pos, uchar_pos;
  139. int is_ascii;
  140. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  141. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  142. "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
  143. RETURN_FALSE;
  144. }
  145. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  146. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
  147. RETURN_FALSE;
  148. }
  149. /* we checked that it will fit: */
  150. offset = (int32_t) loffset;
  151. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  152. if (needle_len == 0) {
  153. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
  154. RETURN_FALSE;
  155. }
  156. is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
  157. if ( is_ascii ) {
  158. needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
  159. php_strtolower((char *)needle_dup, needle_len);
  160. haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
  161. php_strtolower((char *)haystack_dup, haystack_len);
  162. found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
  163. efree(haystack_dup);
  164. efree(needle_dup);
  165. if (found) {
  166. RETURN_LONG(found - haystack_dup);
  167. }
  168. /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
  169. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  170. RETURN_FALSE;
  171. }
  172. }
  173. /* do utf16 part of the strpos */
  174. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC );
  175. if ( ret_pos >= 0 ) {
  176. RETURN_LONG(ret_pos + offset);
  177. } else {
  178. RETURN_FALSE;
  179. }
  180. }
  181. /* }}} */
  182. /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
  183. Find position of last occurrence of a string within another */
  184. PHP_FUNCTION(grapheme_strrpos)
  185. {
  186. unsigned char *haystack, *needle;
  187. int haystack_len, needle_len;
  188. long loffset = 0;
  189. int32_t offset = 0;
  190. int32_t ret_pos;
  191. int is_ascii;
  192. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  193. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  194. "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
  195. RETURN_FALSE;
  196. }
  197. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  198. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  199. RETURN_FALSE;
  200. }
  201. /* we checked that it will fit: */
  202. offset = (int32_t) loffset;
  203. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  204. if (needle_len == 0) {
  205. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  206. RETURN_FALSE;
  207. }
  208. is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
  209. if ( is_ascii ) {
  210. ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
  211. if ( ret_pos >= 0 ) {
  212. RETURN_LONG(ret_pos);
  213. }
  214. /* if the needle was ascii too, we are done */
  215. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  216. RETURN_FALSE;
  217. }
  218. /* else we need to continue via utf16 */
  219. }
  220. ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC);
  221. if ( ret_pos >= 0 ) {
  222. RETURN_LONG(ret_pos);
  223. } else {
  224. RETURN_FALSE;
  225. }
  226. }
  227. /* }}} */
  228. /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
  229. Find position of last occurrence of a string within another, ignoring case */
  230. PHP_FUNCTION(grapheme_strripos)
  231. {
  232. unsigned char *haystack, *needle;
  233. int haystack_len, needle_len;
  234. long loffset = 0;
  235. int32_t offset = 0;
  236. int32_t ret_pos;
  237. int is_ascii;
  238. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  239. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  240. "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
  241. RETURN_FALSE;
  242. }
  243. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  244. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  245. RETURN_FALSE;
  246. }
  247. /* we checked that it will fit: */
  248. offset = (int32_t) loffset;
  249. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  250. if (needle_len == 0) {
  251. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  252. RETURN_FALSE;
  253. }
  254. is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
  255. if ( is_ascii ) {
  256. unsigned char *needle_dup, *haystack_dup;
  257. needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
  258. php_strtolower((char *)needle_dup, needle_len);
  259. haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
  260. php_strtolower((char *)haystack_dup, haystack_len);
  261. ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
  262. efree(haystack_dup);
  263. efree(needle_dup);
  264. if ( ret_pos >= 0 ) {
  265. RETURN_LONG(ret_pos);
  266. }
  267. /* if the needle was ascii too, we are done */
  268. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  269. RETURN_FALSE;
  270. }
  271. /* else we need to continue via utf16 */
  272. }
  273. ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC);
  274. if ( ret_pos >= 0 ) {
  275. RETURN_LONG(ret_pos);
  276. } else {
  277. RETURN_FALSE;
  278. }
  279. }
  280. /* }}} */
  281. /* {{{ proto string grapheme_substr(string str, int start [, int length])
  282. Returns part of a string */
  283. PHP_FUNCTION(grapheme_substr)
  284. {
  285. unsigned char *str, *sub_str;
  286. UChar *ustr;
  287. int str_len, sub_str_len, ustr_len;
  288. long lstart = 0, length = 0;
  289. int32_t start = 0;
  290. int iter_val;
  291. UErrorCode status;
  292. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  293. UBreakIterator* bi = NULL;
  294. int sub_str_start_pos, sub_str_end_pos;
  295. int32_t (*iter_func)(UBreakIterator *);
  296. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
  297. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  298. "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
  299. RETURN_FALSE;
  300. }
  301. if ( OUTSIDE_STRING(lstart, str_len) ) {
  302. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
  303. RETURN_FALSE;
  304. }
  305. /* we checked that it will fit: */
  306. start = (int32_t) lstart;
  307. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  308. if ( grapheme_ascii_check(str, str_len) >= 0 ) {
  309. grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
  310. if ( NULL == sub_str ) {
  311. RETURN_FALSE;
  312. }
  313. RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
  314. }
  315. ustr = NULL;
  316. ustr_len = 0;
  317. status = U_ZERO_ERROR;
  318. intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
  319. if ( U_FAILURE( status ) ) {
  320. /* Set global error code. */
  321. intl_error_set_code( NULL, status TSRMLS_CC );
  322. /* Set error messages. */
  323. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC );
  324. efree( ustr );
  325. RETURN_FALSE;
  326. }
  327. bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
  328. if( U_FAILURE(status) ) {
  329. RETURN_FALSE;
  330. }
  331. ubrk_setText(bi, ustr, ustr_len, &status);
  332. if ( start < 0 ) {
  333. iter_func = ubrk_previous;
  334. ubrk_last(bi);
  335. iter_val = 1;
  336. }
  337. else {
  338. iter_func = ubrk_next;
  339. iter_val = -1;
  340. }
  341. sub_str_start_pos = 0;
  342. while ( start ) {
  343. sub_str_start_pos = iter_func(bi);
  344. if ( UBRK_DONE == sub_str_start_pos ) {
  345. break;
  346. }
  347. start += iter_val;
  348. }
  349. if ( 0 != start || sub_str_start_pos >= ustr_len ) {
  350. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
  351. efree(ustr);
  352. ubrk_close(bi);
  353. RETURN_FALSE;
  354. }
  355. if (ZEND_NUM_ARGS() <= 2) {
  356. /* no length supplied, return the rest of the string */
  357. sub_str = NULL;
  358. sub_str_len = 0;
  359. status = U_ZERO_ERROR;
  360. intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
  361. efree( ustr );
  362. ubrk_close( bi );
  363. if ( U_FAILURE( status ) ) {
  364. /* Set global error code. */
  365. intl_error_set_code( NULL, status TSRMLS_CC );
  366. /* Set error messages. */
  367. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 1 TSRMLS_CC );
  368. efree( sub_str );
  369. RETURN_FALSE;
  370. }
  371. /* return the allocated string, not a duplicate */
  372. RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
  373. }
  374. /* find the end point of the string to return */
  375. if ( length < 0 ) {
  376. iter_func = ubrk_previous;
  377. ubrk_last(bi);
  378. iter_val = 1;
  379. }
  380. else {
  381. iter_func = ubrk_next;
  382. iter_val = -1;
  383. }
  384. sub_str_end_pos = 0;
  385. while ( length ) {
  386. sub_str_end_pos = iter_func(bi);
  387. if ( UBRK_DONE == sub_str_end_pos ) {
  388. break;
  389. }
  390. length += iter_val;
  391. }
  392. if ( UBRK_DONE == sub_str_end_pos && length < 0) {
  393. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
  394. efree(ustr);
  395. ubrk_close(bi);
  396. RETURN_FALSE;
  397. }
  398. sub_str = NULL;
  399. status = U_ZERO_ERROR;
  400. intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
  401. efree( ustr );
  402. ubrk_close( bi );
  403. if ( U_FAILURE( status ) ) {
  404. /* Set global error code. */
  405. intl_error_set_code( NULL, status TSRMLS_CC );
  406. /* Set error messages. */
  407. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 1 TSRMLS_CC );
  408. if ( NULL != sub_str )
  409. efree( sub_str );
  410. RETURN_FALSE;
  411. }
  412. /* return the allocated string, not a duplicate */
  413. RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
  414. }
  415. /* }}} */
  416. /* {{{ strstr_common_handler */
  417. static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
  418. {
  419. unsigned char *haystack, *needle, *found;
  420. int haystack_len, needle_len;
  421. int ret_pos, uchar_pos;
  422. zend_bool part = 0;
  423. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
  424. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  425. "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
  426. RETURN_FALSE;
  427. }
  428. if (needle_len == 0) {
  429. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  430. RETURN_FALSE;
  431. }
  432. if ( !f_ignore_case ) {
  433. /* ASCII optimization: quick check to see if the string might be there
  434. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  435. */
  436. found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
  437. /* if it isn't there the we are done */
  438. if ( !found ) {
  439. RETURN_FALSE;
  440. }
  441. /* if it is there, and if the haystack is ascii, we are all done */
  442. if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
  443. size_t found_offset = found - haystack;
  444. if (part) {
  445. RETURN_STRINGL(((char *)haystack) , found_offset, 1);
  446. } else {
  447. RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
  448. }
  449. }
  450. }
  451. /* need to work in utf16 */
  452. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC );
  453. if ( ret_pos < 0 ) {
  454. RETURN_FALSE;
  455. }
  456. /* uchar_pos is the 'nth' Unicode character position of the needle */
  457. ret_pos = 0;
  458. U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
  459. if (part) {
  460. RETURN_STRINGL(((char *)haystack), ret_pos, 1);
  461. }
  462. else {
  463. RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
  464. }
  465. }
  466. /* }}} */
  467. /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
  468. Finds first occurrence of a string within another */
  469. PHP_FUNCTION(grapheme_strstr)
  470. {
  471. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
  472. }
  473. /* }}} */
  474. /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
  475. Finds first occurrence of a string within another */
  476. PHP_FUNCTION(grapheme_stristr)
  477. {
  478. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
  479. }
  480. /* }}} */
  481. /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
  482. inline int32_t
  483. grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
  484. {
  485. int pos = 0, prev_pos = 0;
  486. int ret_pos = 0, prev_ret_pos = 0;
  487. while ( 1 ) {
  488. pos = ubrk_next(bi);
  489. if ( UBRK_DONE == pos ) {
  490. break;
  491. }
  492. /* if we are beyond our limit, then the loop is done */
  493. if ( pos > csize ) {
  494. break;
  495. }
  496. /* update our pointer in the original UTF-8 buffer by as many characters
  497. as ubrk_next iterated over */
  498. prev_ret_pos = ret_pos;
  499. U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
  500. if ( prev_ret_pos == ret_pos ) {
  501. /* something wrong - malformed utf8? */
  502. break;
  503. }
  504. prev_pos = pos;
  505. }
  506. return ret_pos;
  507. }
  508. /* }}} */
  509. /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
  510. inline int32_t
  511. grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
  512. {
  513. int pos = 0, prev_pos = 0;
  514. int ret_pos = 0, prev_ret_pos = 0;
  515. while ( 1 ) {
  516. pos = ubrk_next(bi);
  517. if ( UBRK_DONE == pos ) {
  518. break;
  519. }
  520. prev_ret_pos = ret_pos;
  521. U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
  522. if ( ret_pos > bsize ) {
  523. ret_pos = prev_ret_pos;
  524. break;
  525. }
  526. if ( prev_ret_pos == ret_pos ) {
  527. /* something wrong - malformed utf8? */
  528. break;
  529. }
  530. prev_pos = pos;
  531. }
  532. return ret_pos;
  533. }
  534. /* }}} */
  535. /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
  536. inline int32_t
  537. grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
  538. {
  539. int pos = 0, next_pos = 0;
  540. int ret_pos = 0;
  541. while ( size ) {
  542. next_pos = ubrk_next(bi);
  543. if ( UBRK_DONE == next_pos ) {
  544. break;
  545. }
  546. pos = next_pos;
  547. size--;
  548. }
  549. /* pos is one past the last UChar - and represent the number of code units to
  550. advance in the utf-8 buffer
  551. */
  552. U8_FWD_N(pstr, ret_pos, str_len, pos);
  553. return ret_pos;
  554. }
  555. /* }}} */
  556. /* {{{ grapheme extract iter function pointer array */
  557. typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
  558. static grapheme_extract_iter grapheme_extract_iters[] = {
  559. &grapheme_extract_count_iter,
  560. &grapheme_extract_bytecount_iter,
  561. &grapheme_extract_charcount_iter,
  562. };
  563. /* }}} */
  564. /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
  565. Function to extract a sequence of default grapheme clusters */
  566. PHP_FUNCTION(grapheme_extract)
  567. {
  568. unsigned char *str, *pstr;
  569. UChar *ustr;
  570. int str_len, ustr_len;
  571. long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
  572. long lstart = 0; /* starting position in str in bytes */
  573. int32_t start = 0;
  574. long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
  575. UErrorCode status;
  576. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  577. UBreakIterator* bi = NULL;
  578. int ret_pos;
  579. zval *next = NULL; /* return offset of next part of the string */
  580. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
  581. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  582. "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
  583. RETURN_FALSE;
  584. }
  585. if ( NULL != next ) {
  586. if ( !PZVAL_IS_REF(next) ) {
  587. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  588. "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
  589. RETURN_FALSE;
  590. }
  591. else {
  592. /* initialize next */
  593. ZVAL_LONG(next, start);
  594. }
  595. }
  596. if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
  597. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  598. "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
  599. RETURN_FALSE;
  600. }
  601. if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
  602. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 1 TSRMLS_CC );
  603. RETURN_FALSE;
  604. }
  605. /* we checked that it will fit: */
  606. start = (int32_t) lstart;
  607. pstr = str + start;
  608. /* just in case pstr points in the middle of a character, move forward to the start of the next char */
  609. if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  610. unsigned char *str_end = str + str_len;
  611. while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  612. pstr++;
  613. if ( pstr >= str_end ) {
  614. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  615. "grapheme_extract: invalid input string", 0 TSRMLS_CC );
  616. RETURN_FALSE;
  617. }
  618. }
  619. }
  620. str_len -= (pstr - str);
  621. /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
  622. (size + 1 because the size-th character might be the beginning of a grapheme cluster)
  623. */
  624. if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
  625. long nsize = ( size < str_len ? size : str_len );
  626. if ( NULL != next ) {
  627. ZVAL_LONG(next, start+nsize);
  628. }
  629. RETURN_STRINGL(((char *)pstr), nsize, 1);
  630. }
  631. /* convert the strings to UTF-16. */
  632. ustr = NULL;
  633. ustr_len = 0;
  634. status = U_ZERO_ERROR;
  635. intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
  636. if ( U_FAILURE( status ) ) {
  637. /* Set global error code. */
  638. intl_error_set_code( NULL, status TSRMLS_CC );
  639. /* Set error messages. */
  640. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC );
  641. if ( NULL != ustr )
  642. efree( ustr );
  643. RETURN_FALSE;
  644. }
  645. bi = NULL;
  646. status = U_ZERO_ERROR;
  647. bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
  648. ubrk_setText(bi, ustr, ustr_len, &status);
  649. /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
  650. can't back up. So, we will not do anything. */
  651. /* now we need to find the end of the chunk the user wants us to return */
  652. ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
  653. efree(ustr);
  654. ubrk_close(bi);
  655. if ( NULL != next ) {
  656. ZVAL_LONG(next, start+ret_pos);
  657. }
  658. RETURN_STRINGL(((char *)pstr), ret_pos, 1);
  659. }
  660. /* }}} */
  661. /*
  662. * Local variables:
  663. * tab-width: 4
  664. * c-basic-offset: 4
  665. * End:
  666. * vim600: fdm=marker
  667. * vim: noet sw=4 ts=4
  668. */