PageRenderTime 40ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/ext/intl/normalizer/normalizer_normalize.c

http://github.com/php/php-src
C | 359 lines | 231 code | 63 blank | 65 comment | 29 complexity | bddd0531a4984d1780be0c2582917c66 MD5 | raw file
Possible License(s): BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, LGPL-2.1
  1. /*
  2. +----------------------------------------------------------------------+
  3. | This source file is subject to version 3.01 of the PHP license, |
  4. | that is bundled with this package in the file LICENSE, and is |
  5. | available through the world-wide-web at the following url: |
  6. | http://www.php.net/license/3_01.txt |
  7. | If you did not receive a copy of the PHP license and are unable to |
  8. | obtain it through the world-wide-web, please send a note to |
  9. | license@php.net so we can mail you a copy immediately. |
  10. +----------------------------------------------------------------------+
  11. | Authors: Ed Batutis <ed@batutis.com> |
  12. +----------------------------------------------------------------------+
  13. */
  14. #ifdef HAVE_CONFIG_H
  15. #include "config.h"
  16. #endif
  17. #include "php_intl.h"
  18. #if U_ICU_VERSION_MAJOR_NUM < 56
  19. #include "unicode/unorm.h"
  20. #else
  21. #include <unicode/unorm2.h>
  22. #endif
  23. #include "normalizer.h"
  24. #include "normalizer_class.h"
  25. #include "intl_convert.h"
  26. #include <unicode/utf8.h>
  27. #if U_ICU_VERSION_MAJOR_NUM >= 56
  28. static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
  29. {/*{{{*/
  30. switch (form)
  31. {
  32. case NORMALIZER_FORM_C:
  33. return unorm2_getNFCInstance(err);
  34. break;
  35. case NORMALIZER_FORM_D:
  36. return unorm2_getNFDInstance(err);
  37. break;
  38. case NORMALIZER_FORM_KC:
  39. return unorm2_getNFKCInstance(err);
  40. break;
  41. case NORMALIZER_FORM_KD:
  42. return unorm2_getNFKDInstance(err);
  43. break;
  44. case NORMALIZER_FORM_KC_CF:
  45. return unorm2_getNFKCCasefoldInstance(err);
  46. break;
  47. }
  48. *err = U_ILLEGAL_ARGUMENT_ERROR;
  49. return NULL;
  50. }/*}}}*/
  51. static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
  52. {/*{{{*/
  53. const UNormalizer2 *norm = intl_get_normalizer(form, err);
  54. if (U_FAILURE(*err)) {
  55. return -1;
  56. }
  57. return unorm2_normalize(norm, src, src_len, dst, dst_len, err);
  58. }/*}}}*/
  59. static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
  60. {/*{{{*/
  61. const UNormalizer2 *norm = intl_get_normalizer(form, err);
  62. if(U_FAILURE(*err)) {
  63. return FALSE;
  64. }
  65. return unorm2_isNormalized(norm, uinput, uinput_len, err);
  66. }/*}}}*/
  67. #endif
  /* {{{ proto string Normalizer::normalize( string $input [, int $form = FORM_C] )
   * Normalize a string. }}} */
  /* {{{ proto string normalizer_normalize( string $input [, int $form = FORM_C] )
   * Normalize a string.
   */
  73. PHP_FUNCTION( normalizer_normalize )
  74. {
  75. char* input = NULL;
  76. /* form is optional, defaults to FORM_C */
  77. zend_long form = NORMALIZER_DEFAULT;
  78. size_t input_len = 0;
  79. UChar* uinput = NULL;
  80. int32_t uinput_len = 0;
  81. int expansion_factor = 1;
  82. UErrorCode status = U_ZERO_ERROR;
  83. UChar* uret_buf = NULL;
  84. int32_t uret_len = 0;
  85. zend_string* u8str;
  86. int32_t size_needed;
  87. intl_error_reset( NULL );
  88. /* Parse parameters. */
  89. if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
  90. &input, &input_len, &form ) == FAILURE )
  91. {
  92. RETURN_THROWS();
  93. }
  94. expansion_factor = 1;
  95. switch(form) {
  96. case NORMALIZER_FORM_D:
  97. expansion_factor = 3;
  98. break;
  99. case NORMALIZER_FORM_KD:
  100. expansion_factor = 3;
  101. break;
  102. case NORMALIZER_FORM_C:
  103. case NORMALIZER_FORM_KC:
  104. #if U_ICU_VERSION_MAJOR_NUM >= 56
  105. case NORMALIZER_FORM_KC_CF:
  106. #endif
  107. break;
  108. default:
  109. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  110. "normalizer_normalize: illegal normalization form", 0 );
  111. RETURN_FALSE;
  112. }
  113. /*
  114. * Normalize string (converting it to UTF-16 first).
  115. */
  116. /* First convert the string to UTF-16. */
  117. intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
  118. if( U_FAILURE( status ) )
  119. {
  120. /* Set global error code. */
  121. intl_error_set_code( NULL, status );
  122. /* Set error messages. */
  123. intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
  124. if (uinput) {
  125. efree( uinput );
  126. }
  127. RETURN_FALSE;
  128. }
  129. /* Allocate memory for the destination buffer for normalization */
  130. uret_len = uinput_len * expansion_factor;
  131. uret_buf = eumalloc( uret_len + 1 );
  132. /* normalize */
  133. #if U_ICU_VERSION_MAJOR_NUM < 56
  134. size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
  135. #else
  136. size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
  137. #endif
  138. /* Bail out if an unexpected error occurred.
  139. * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
  140. * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
  141. */
  142. if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
  143. efree( uret_buf );
  144. efree( uinput );
  145. RETURN_NULL();
  146. }
  147. if ( size_needed > uret_len ) {
  148. /* realloc does not seem to work properly - memory is corrupted
  149. * uret_buf = eurealloc(uret_buf, size_needed + 1);
  150. */
  151. efree( uret_buf );
  152. uret_buf = eumalloc( size_needed + 1 );
  153. uret_len = size_needed;
  154. status = U_ZERO_ERROR;
  155. /* try normalize again */
  156. #if U_ICU_VERSION_MAJOR_NUM < 56
  157. size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
  158. #else
  159. size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
  160. #endif
  161. /* Bail out if an unexpected error occurred. */
  162. if( U_FAILURE(status) ) {
  163. /* Set error messages. */
  164. intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
  165. efree( uret_buf );
  166. efree( uinput );
  167. RETURN_FALSE;
  168. }
  169. }
  170. efree( uinput );
  171. /* the buffer we actually used */
  172. uret_len = size_needed;
  173. /* Convert normalized string from UTF-16 to UTF-8. */
  174. u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
  175. efree( uret_buf );
  176. if( !u8str )
  177. {
  178. intl_error_set( NULL, status,
  179. "normalizer_normalize: error converting normalized text UTF-8", 0 );
  180. RETURN_FALSE;
  181. }
  182. /* Return it. */
  183. RETVAL_NEW_STR( u8str );
  184. }
  185. /* }}} */
  /* {{{ proto bool Normalizer::isNormalized( string $input [, int $form = FORM_C] )
   * Test if a string is in a given normalization form. }}} */
  /* {{{ proto bool normalizer_is_normalized( string $input [, int $form = FORM_C] )
   * Test if a string is in a given normalization form.
   */
  191. PHP_FUNCTION( normalizer_is_normalized )
  192. {
  193. char* input = NULL;
  194. /* form is optional, defaults to FORM_C */
  195. zend_long form = NORMALIZER_DEFAULT;
  196. size_t input_len = 0;
  197. UChar* uinput = NULL;
  198. int uinput_len = 0;
  199. UErrorCode status = U_ZERO_ERROR;
  200. UBool uret = FALSE;
  201. intl_error_reset( NULL );
  202. /* Parse parameters. */
  203. if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
  204. &input, &input_len, &form) == FAILURE )
  205. {
  206. RETURN_THROWS();
  207. }
  208. switch(form) {
  209. case NORMALIZER_FORM_D:
  210. case NORMALIZER_FORM_KD:
  211. case NORMALIZER_FORM_C:
  212. case NORMALIZER_FORM_KC:
  213. #if U_ICU_VERSION_MAJOR_NUM >= 56
  214. case NORMALIZER_FORM_KC_CF:
  215. #endif
  216. break;
  217. default:
  218. intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  219. "normalizer_normalize: illegal normalization form", 0 );
  220. RETURN_FALSE;
  221. }
  222. /*
  223. * Test normalization of string (converting it to UTF-16 first).
  224. */
  225. /* First convert the string to UTF-16. */
  226. intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
  227. if( U_FAILURE( status ) )
  228. {
  229. /* Set global error code. */
  230. intl_error_set_code( NULL, status );
  231. /* Set error messages. */
  232. intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
  233. if (uinput) {
  234. efree( uinput );
  235. }
  236. RETURN_FALSE;
  237. }
  238. /* test string */
  239. #if U_ICU_VERSION_MAJOR_NUM < 56
  240. uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
  241. #else
  242. uret = intl_is_normalized(form, uinput, uinput_len, &status);
  243. #endif
  244. efree( uinput );
  245. /* Bail out if an unexpected error occurred. */
  246. if( U_FAILURE(status) ) {
  247. /* Set error messages. */
  248. intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
  249. RETURN_FALSE;
  250. }
  251. if ( uret )
  252. RETURN_TRUE;
  253. RETURN_FALSE;
  254. }
  255. /* }}} */
  /* {{{ proto string|null Normalizer::getRawDecomposition( string $input [, int $form = FORM_C] )
   * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point. }}} */
  /* {{{ proto string|null normalizer_get_raw_decomposition( string $input [, int $form = FORM_C] )
   * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point.
   */
  261. #if U_ICU_VERSION_MAJOR_NUM >= 56
  262. PHP_FUNCTION( normalizer_get_raw_decomposition )
  263. {
  264. char* input = NULL;
  265. size_t input_length = 0;
  266. UChar32 codepoint = -1;
  267. int32_t offset = 0;
  268. UErrorCode status = U_ZERO_ERROR;
  269. const UNormalizer2 *norm;
  270. UChar decomposition[32];
  271. int32_t decomposition_length;
  272. zend_long form = NORMALIZER_DEFAULT;
  273. intl_error_reset(NULL);
  274. if ((zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &input, &input_length, &form) == FAILURE)) {
  275. RETURN_THROWS();
  276. }
  277. norm = intl_get_normalizer(form, &status);
  278. U8_NEXT(input, offset, input_length, codepoint);
  279. if ((size_t)offset != input_length) {
  280. intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
  281. intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
  282. return;
  283. }
  284. if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
  285. intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
  286. intl_error_set_custom_msg(NULL, "Code point out of range", 0);
  287. return;
  288. }
  289. decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition, 32, &status);
  290. if (decomposition_length == -1) {
  291. RETURN_NULL();
  292. }
  293. RETVAL_NEW_STR(intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status));
  294. }
  295. #endif
  296. /* }}} */