PageRenderTime 55ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/release/src/router/php/ext/intl/grapheme/grapheme_string.c

https://gitlab.com/envieidoc/tomato
C | 955 lines | 578 code | 248 blank | 129 comment | 142 complexity | 040eaabd11e42e5c72f8ed7077f9861f MD5 | raw file
  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 5 |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | http://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Ed Batutis <ed@batutis.com> |
  14. +----------------------------------------------------------------------+
  15. */
  16. /* {{{ includes */
  17. #ifdef HAVE_CONFIG_H
  18. #include "config.h"
  19. #endif
  20. #include <php.h>
  21. #include "grapheme.h"
  22. #include "grapheme_util.h"
  23. #include <unicode/utypes.h>
  24. #include <unicode/ucol.h>
  25. #include <unicode/ustring.h>
  26. #include <unicode/ubrk.h>
  27. #include "ext/standard/php_string.h"
  28. /* }}} */
  29. #define GRAPHEME_EXTRACT_TYPE_COUNT 0
  30. #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
  31. #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
  32. #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
  33. #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
  34. /* {{{ grapheme_register_constants
  35. * Register API constants
  36. */
  37. void grapheme_register_constants( INIT_FUNC_ARGS )
  38. {
  39. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
  40. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
  41. REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
  42. }
  43. /* }}} */
  44. /* {{{ proto int grapheme_strlen(string str)
  45. Get number of graphemes in a string */
  46. PHP_FUNCTION(grapheme_strlen)
  47. {
  48. unsigned char* string;
  49. int string_len;
  50. UChar* ustring = NULL;
  51. int ustring_len = 0;
  52. int ret_len;
  53. UErrorCode status;
  54. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
  55. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  56. "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
  57. RETURN_FALSE;
  58. }
  59. ret_len = grapheme_ascii_check(string, string_len);
  60. if ( ret_len >= 0 )
  61. RETURN_LONG(ret_len);
  62. /* convert the string to UTF-16. */
  63. status = U_ZERO_ERROR;
  64. intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
  65. if ( U_FAILURE( status ) ) {
  66. /* Set global error code. */
  67. intl_error_set_code( NULL, status TSRMLS_CC );
  68. /* Set error messages. */
  69. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
  70. if (ustring) {
  71. efree( ustring );
  72. }
  73. RETURN_NULL();
  74. }
  75. ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
  76. if (ustring) {
  77. efree( ustring );
  78. }
  79. if (ret_len >= 0) {
  80. RETVAL_LONG(ret_len);
  81. } else {
  82. RETVAL_FALSE;
  83. }
  84. }
  85. /* }}} */
  86. /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
  87. Find position of first occurrence of a string within another */
  88. PHP_FUNCTION(grapheme_strpos)
  89. {
  90. unsigned char *haystack, *needle;
  91. int haystack_len, needle_len;
  92. unsigned char *found;
  93. long loffset = 0;
  94. int32_t offset = 0;
  95. int ret_pos;
  96. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  97. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  98. "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
  99. RETURN_FALSE;
  100. }
  101. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  102. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  103. RETURN_FALSE;
  104. }
  105. /* we checked that it will fit: */
  106. offset = (int32_t) loffset;
  107. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  108. if (needle_len == 0) {
  109. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  110. RETURN_FALSE;
  111. }
  112. /* quick check to see if the string might be there
  113. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  114. */
  115. found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
  116. /* if it isn't there the we are done */
  117. if (!found) {
  118. RETURN_FALSE;
  119. }
  120. /* if it is there, and if the haystack is ascii, we are all done */
  121. if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
  122. RETURN_LONG(found - haystack);
  123. }
  124. /* do utf16 part of the strpos */
  125. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
  126. if ( ret_pos >= 0 ) {
  127. RETURN_LONG(ret_pos);
  128. } else {
  129. RETURN_FALSE;
  130. }
  131. }
  132. /* }}} */
  133. /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
  134. Find position of first occurrence of a string within another, ignoring case differences */
  135. PHP_FUNCTION(grapheme_stripos)
  136. {
  137. unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
  138. int haystack_len, needle_len;
  139. unsigned char *found;
  140. long loffset = 0;
  141. int32_t offset = 0;
  142. int ret_pos;
  143. int is_ascii;
  144. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  145. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  146. "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
  147. RETURN_FALSE;
  148. }
  149. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  150. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
  151. RETURN_FALSE;
  152. }
  153. /* we checked that it will fit: */
  154. offset = (int32_t) loffset;
  155. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  156. if (needle_len == 0) {
  157. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
  158. RETURN_FALSE;
  159. }
  160. is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
  161. if ( is_ascii ) {
  162. needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
  163. php_strtolower((char *)needle_dup, needle_len);
  164. haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
  165. php_strtolower((char *)haystack_dup, haystack_len);
  166. found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
  167. efree(haystack_dup);
  168. efree(needle_dup);
  169. if (found) {
  170. RETURN_LONG(found - haystack_dup);
  171. }
  172. /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
  173. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  174. RETURN_FALSE;
  175. }
  176. }
  177. /* do utf16 part of the strpos */
  178. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
  179. if ( ret_pos >= 0 ) {
  180. RETURN_LONG(ret_pos);
  181. } else {
  182. RETURN_FALSE;
  183. }
  184. }
  185. /* }}} */
  186. /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
  187. Find position of last occurrence of a string within another */
  188. PHP_FUNCTION(grapheme_strrpos)
  189. {
  190. unsigned char *haystack, *needle;
  191. int haystack_len, needle_len;
  192. long loffset = 0;
  193. int32_t offset = 0;
  194. int32_t ret_pos;
  195. int is_ascii;
  196. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  197. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  198. "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
  199. RETURN_FALSE;
  200. }
  201. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  202. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  203. RETURN_FALSE;
  204. }
  205. /* we checked that it will fit: */
  206. offset = (int32_t) loffset;
  207. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  208. if (needle_len == 0) {
  209. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  210. RETURN_FALSE;
  211. }
  212. is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
  213. if ( is_ascii ) {
  214. ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
  215. if ( ret_pos >= 0 ) {
  216. RETURN_LONG(ret_pos);
  217. }
  218. /* if the needle was ascii too, we are done */
  219. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  220. RETURN_FALSE;
  221. }
  222. /* else we need to continue via utf16 */
  223. }
  224. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
  225. if ( ret_pos >= 0 ) {
  226. RETURN_LONG(ret_pos);
  227. } else {
  228. RETURN_FALSE;
  229. }
  230. }
  231. /* }}} */
  232. /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
  233. Find position of last occurrence of a string within another, ignoring case */
  234. PHP_FUNCTION(grapheme_strripos)
  235. {
  236. unsigned char *haystack, *needle;
  237. int haystack_len, needle_len;
  238. long loffset = 0;
  239. int32_t offset = 0;
  240. int32_t ret_pos;
  241. int is_ascii;
  242. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
  243. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  244. "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
  245. RETURN_FALSE;
  246. }
  247. if ( OUTSIDE_STRING(loffset, haystack_len) ) {
  248. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
  249. RETURN_FALSE;
  250. }
  251. /* we checked that it will fit: */
  252. offset = (int32_t) loffset;
  253. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  254. if (needle_len == 0) {
  255. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  256. RETURN_FALSE;
  257. }
  258. is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
  259. if ( is_ascii ) {
  260. unsigned char *needle_dup, *haystack_dup;
  261. needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
  262. php_strtolower((char *)needle_dup, needle_len);
  263. haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
  264. php_strtolower((char *)haystack_dup, haystack_len);
  265. ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
  266. efree(haystack_dup);
  267. efree(needle_dup);
  268. if ( ret_pos >= 0 ) {
  269. RETURN_LONG(ret_pos);
  270. }
  271. /* if the needle was ascii too, we are done */
  272. if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
  273. RETURN_FALSE;
  274. }
  275. /* else we need to continue via utf16 */
  276. }
  277. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
  278. if ( ret_pos >= 0 ) {
  279. RETURN_LONG(ret_pos);
  280. } else {
  281. RETURN_FALSE;
  282. }
  283. }
  284. /* }}} */
  285. /* {{{ proto string grapheme_substr(string str, int start [, int length])
  286. Returns part of a string */
  287. PHP_FUNCTION(grapheme_substr)
  288. {
  289. unsigned char *str, *sub_str;
  290. UChar *ustr;
  291. int str_len, sub_str_len, ustr_len;
  292. long lstart = 0, length = 0;
  293. int32_t start = 0;
  294. int iter_val;
  295. UErrorCode status;
  296. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  297. UBreakIterator* bi = NULL;
  298. int sub_str_start_pos, sub_str_end_pos;
  299. int32_t (*iter_func)(UBreakIterator *);
  300. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
  301. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  302. "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
  303. RETURN_FALSE;
  304. }
  305. if ( OUTSIDE_STRING(lstart, str_len) ) {
  306. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
  307. RETURN_FALSE;
  308. }
  309. /* we checked that it will fit: */
  310. start = (int32_t) lstart;
  311. /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
  312. if ( grapheme_ascii_check(str, str_len) >= 0 ) {
  313. grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
  314. if ( NULL == sub_str ) {
  315. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
  316. RETURN_FALSE;
  317. }
  318. RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
  319. }
  320. ustr = NULL;
  321. ustr_len = 0;
  322. status = U_ZERO_ERROR;
  323. intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
  324. if ( U_FAILURE( status ) ) {
  325. /* Set global error code. */
  326. intl_error_set_code( NULL, status TSRMLS_CC );
  327. /* Set error messages. */
  328. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
  329. if (ustr) {
  330. efree( ustr );
  331. }
  332. RETURN_FALSE;
  333. }
  334. bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
  335. if( U_FAILURE(status) ) {
  336. RETURN_FALSE;
  337. }
  338. ubrk_setText(bi, ustr, ustr_len, &status);
  339. if ( start < 0 ) {
  340. iter_func = ubrk_previous;
  341. ubrk_last(bi);
  342. iter_val = 1;
  343. }
  344. else {
  345. iter_func = ubrk_next;
  346. iter_val = -1;
  347. }
  348. sub_str_start_pos = 0;
  349. while ( start ) {
  350. sub_str_start_pos = iter_func(bi);
  351. if ( UBRK_DONE == sub_str_start_pos ) {
  352. break;
  353. }
  354. start += iter_val;
  355. }
  356. if ( 0 != start || sub_str_start_pos >= ustr_len ) {
  357. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
  358. if (ustr) {
  359. efree(ustr);
  360. }
  361. ubrk_close(bi);
  362. RETURN_FALSE;
  363. }
  364. if (ZEND_NUM_ARGS() <= 2) {
  365. /* no length supplied, return the rest of the string */
  366. sub_str = NULL;
  367. sub_str_len = 0;
  368. status = U_ZERO_ERROR;
  369. intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
  370. if (ustr) {
  371. efree( ustr );
  372. }
  373. ubrk_close( bi );
  374. if ( U_FAILURE( status ) ) {
  375. /* Set global error code. */
  376. intl_error_set_code( NULL, status TSRMLS_CC );
  377. /* Set error messages. */
  378. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
  379. if (sub_str) {
  380. efree( sub_str );
  381. }
  382. RETURN_FALSE;
  383. }
  384. /* return the allocated string, not a duplicate */
  385. RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
  386. }
  387. if(length == 0) {
  388. /* empty length - we've validated start, we can return "" now */
  389. if (ustr) {
  390. efree(ustr);
  391. }
  392. ubrk_close(bi);
  393. RETURN_EMPTY_STRING();
  394. }
  395. /* find the end point of the string to return */
  396. if ( length < 0 ) {
  397. iter_func = ubrk_previous;
  398. ubrk_last(bi);
  399. iter_val = 1;
  400. }
  401. else {
  402. iter_func = ubrk_next;
  403. iter_val = -1;
  404. }
  405. sub_str_end_pos = 0;
  406. while ( length ) {
  407. sub_str_end_pos = iter_func(bi);
  408. if ( UBRK_DONE == sub_str_end_pos ) {
  409. break;
  410. }
  411. length += iter_val;
  412. }
  413. ubrk_close(bi);
  414. if ( UBRK_DONE == sub_str_end_pos) {
  415. if(length < 0) {
  416. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
  417. efree(ustr);
  418. RETURN_FALSE;
  419. } else {
  420. sub_str_end_pos = ustr_len;
  421. }
  422. }
  423. if(sub_str_start_pos > sub_str_end_pos) {
  424. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
  425. efree(ustr);
  426. RETURN_FALSE;
  427. }
  428. sub_str = NULL;
  429. status = U_ZERO_ERROR;
  430. intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
  431. efree( ustr );
  432. if ( U_FAILURE( status ) ) {
  433. /* Set global error code. */
  434. intl_error_set_code( NULL, status TSRMLS_CC );
  435. /* Set error messages. */
  436. intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
  437. if ( NULL != sub_str )
  438. efree( sub_str );
  439. RETURN_FALSE;
  440. }
  441. /* return the allocated string, not a duplicate */
  442. RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
  443. }
  444. /* }}} */
  445. /* {{{ strstr_common_handler */
  446. static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
  447. {
  448. unsigned char *haystack, *needle, *found;
  449. int haystack_len, needle_len;
  450. int ret_pos, uchar_pos;
  451. zend_bool part = 0;
  452. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
  453. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  454. "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
  455. RETURN_FALSE;
  456. }
  457. if (needle_len == 0) {
  458. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
  459. RETURN_FALSE;
  460. }
  461. if ( !f_ignore_case ) {
  462. /* ASCII optimization: quick check to see if the string might be there
  463. * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
  464. */
  465. found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
  466. /* if it isn't there the we are done */
  467. if ( !found ) {
  468. RETURN_FALSE;
  469. }
  470. /* if it is there, and if the haystack is ascii, we are all done */
  471. if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
  472. size_t found_offset = found - haystack;
  473. if (part) {
  474. RETURN_STRINGL(((char *)haystack) , found_offset, 1);
  475. } else {
  476. RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
  477. }
  478. }
  479. }
  480. /* need to work in utf16 */
  481. ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
  482. if ( ret_pos < 0 ) {
  483. RETURN_FALSE;
  484. }
  485. /* uchar_pos is the 'nth' Unicode character position of the needle */
  486. ret_pos = 0;
  487. U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
  488. if (part) {
  489. RETURN_STRINGL(((char *)haystack), ret_pos, 1);
  490. }
  491. else {
  492. RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
  493. }
  494. }
  495. /* }}} */
  496. /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
  497. Finds first occurrence of a string within another */
  498. PHP_FUNCTION(grapheme_strstr)
  499. {
  500. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
  501. }
  502. /* }}} */
  503. /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
  504. Finds first occurrence of a string within another */
  505. PHP_FUNCTION(grapheme_stristr)
  506. {
  507. strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
  508. }
  509. /* }}} */
  510. /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
  511. static inline int32_t
  512. grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
  513. {
  514. int pos = 0, prev_pos = 0;
  515. int ret_pos = 0, prev_ret_pos = 0;
  516. while ( 1 ) {
  517. pos = ubrk_next(bi);
  518. if ( UBRK_DONE == pos ) {
  519. break;
  520. }
  521. /* if we are beyond our limit, then the loop is done */
  522. if ( pos > csize ) {
  523. break;
  524. }
  525. /* update our pointer in the original UTF-8 buffer by as many characters
  526. as ubrk_next iterated over */
  527. prev_ret_pos = ret_pos;
  528. U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
  529. if ( prev_ret_pos == ret_pos ) {
  530. /* something wrong - malformed utf8? */
  531. break;
  532. }
  533. prev_pos = pos;
  534. }
  535. return ret_pos;
  536. }
  537. /* }}} */
  538. /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
  539. static inline int32_t
  540. grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
  541. {
  542. int pos = 0, prev_pos = 0;
  543. int ret_pos = 0, prev_ret_pos = 0;
  544. while ( 1 ) {
  545. pos = ubrk_next(bi);
  546. if ( UBRK_DONE == pos ) {
  547. break;
  548. }
  549. prev_ret_pos = ret_pos;
  550. U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
  551. if ( ret_pos > bsize ) {
  552. ret_pos = prev_ret_pos;
  553. break;
  554. }
  555. if ( prev_ret_pos == ret_pos ) {
  556. /* something wrong - malformed utf8? */
  557. break;
  558. }
  559. prev_pos = pos;
  560. }
  561. return ret_pos;
  562. }
  563. /* }}} */
  564. /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
  565. static inline int32_t
  566. grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
  567. {
  568. int pos = 0, next_pos = 0;
  569. int ret_pos = 0;
  570. while ( size ) {
  571. next_pos = ubrk_next(bi);
  572. if ( UBRK_DONE == next_pos ) {
  573. break;
  574. }
  575. pos = next_pos;
  576. size--;
  577. }
  578. /* pos is one past the last UChar - and represent the number of code units to
  579. advance in the utf-8 buffer
  580. */
  581. U8_FWD_N(pstr, ret_pos, str_len, pos);
  582. return ret_pos;
  583. }
  584. /* }}} */
  585. /* {{{ grapheme extract iter function pointer array */
  586. typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
  587. static grapheme_extract_iter grapheme_extract_iters[] = {
  588. &grapheme_extract_count_iter,
  589. &grapheme_extract_bytecount_iter,
  590. &grapheme_extract_charcount_iter,
  591. };
  592. /* }}} */
  593. /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
  594. Function to extract a sequence of default grapheme clusters */
  595. PHP_FUNCTION(grapheme_extract)
  596. {
  597. unsigned char *str, *pstr;
  598. UChar *ustr;
  599. int str_len, ustr_len;
  600. long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
  601. long lstart = 0; /* starting position in str in bytes */
  602. int32_t start = 0;
  603. long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
  604. UErrorCode status;
  605. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  606. UBreakIterator* bi = NULL;
  607. int ret_pos;
  608. zval *next = NULL; /* return offset of next part of the string */
  609. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
  610. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  611. "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
  612. RETURN_FALSE;
  613. }
  614. if ( NULL != next ) {
  615. if ( !PZVAL_IS_REF(next) ) {
  616. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  617. "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
  618. RETURN_FALSE;
  619. }
  620. else {
  621. /* initialize next */
  622. zval_dtor(next);
  623. ZVAL_LONG(next, lstart);
  624. }
  625. }
  626. if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
  627. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  628. "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
  629. RETURN_FALSE;
  630. }
  631. if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
  632. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
  633. RETURN_FALSE;
  634. }
  635. if ( size > INT32_MAX || size < 0) {
  636. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
  637. RETURN_FALSE;
  638. }
  639. if (size == 0) {
  640. RETURN_EMPTY_STRING();
  641. }
  642. /* we checked that it will fit: */
  643. start = (int32_t) lstart;
  644. pstr = str + start;
  645. /* just in case pstr points in the middle of a character, move forward to the start of the next char */
  646. if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  647. unsigned char *str_end = str + str_len;
  648. while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
  649. pstr++;
  650. if ( pstr >= str_end ) {
  651. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  652. "grapheme_extract: invalid input string", 0 TSRMLS_CC );
  653. RETURN_FALSE;
  654. }
  655. }
  656. }
  657. str_len -= (pstr - str);
  658. /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
  659. (size + 1 because the size-th character might be the beginning of a grapheme cluster)
  660. */
  661. if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
  662. long nsize = ( size < str_len ? size : str_len );
  663. if ( NULL != next ) {
  664. ZVAL_LONG(next, start+nsize);
  665. }
  666. RETURN_STRINGL(((char *)pstr), nsize, 1);
  667. }
  668. /* convert the strings to UTF-16. */
  669. ustr = NULL;
  670. ustr_len = 0;
  671. status = U_ZERO_ERROR;
  672. intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
  673. if ( U_FAILURE( status ) ) {
  674. /* Set global error code. */
  675. intl_error_set_code( NULL, status TSRMLS_CC );
  676. /* Set error messages. */
  677. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
  678. if ( NULL != ustr )
  679. efree( ustr );
  680. RETURN_FALSE;
  681. }
  682. bi = NULL;
  683. status = U_ZERO_ERROR;
  684. bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
  685. ubrk_setText(bi, ustr, ustr_len, &status);
  686. /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
  687. can't back up. So, we will not do anything. */
  688. /* now we need to find the end of the chunk the user wants us to return */
  689. ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
  690. if (ustr) {
  691. efree(ustr);
  692. }
  693. ubrk_close(bi);
  694. if ( NULL != next ) {
  695. ZVAL_LONG(next, start+ret_pos);
  696. }
  697. RETURN_STRINGL(((char *)pstr), ret_pos, 1);
  698. }
  699. /* }}} */
  700. /*
  701. * Local variables:
  702. * tab-width: 4
  703. * c-basic-offset: 4
  704. * End:
  705. * vim600: fdm=marker
  706. * vim: noet sw=4 ts=4
  707. */