PageRenderTime 43ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/hphp/runtime/base/zend-collator.cpp

https://gitlab.com/Blueprint-Marketing/hhvm
C++ | 630 lines | 501 code | 71 blank | 58 comment | 165 complexity | 571a4bce02d616a9110c8b19b19c791f MD5 | raw file
  1. /*
  2. +----------------------------------------------------------------------+
  3. | HipHop for PHP |
  4. +----------------------------------------------------------------------+
  5. | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com) |
  6. | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
  7. +----------------------------------------------------------------------+
  8. | This source file is subject to version 2.00 of the Zend license, |
  9. | that is bundled with this package in the file LICENSE, and is |
  10. | available through the world-wide-web at the following url: |
  11. | http://www.zend.com/license/2_00.txt. |
  12. | If you did not receive a copy of the Zend license and are unable to |
  13. | obtain it through the world-wide-web, please send a note to |
  14. | license@zend.com so we can mail you a copy immediately. |
  15. +----------------------------------------------------------------------+
  16. */
  17. #include "hphp/runtime/base/zend-collator.h"
  18. #include "hphp/runtime/base/zend-strtod.h"
  19. #include "hphp/runtime/base/intl-convert.h"
  20. #include "hphp/runtime/base/type-conversions.h"
  21. #include "hphp/runtime/base/builtin-functions.h"
  22. #include "hphp/runtime/base/types.h"
  23. #include "hphp/runtime/base/runtime-error.h"
  24. #include "hphp/runtime/base/array-iterator.h"
  25. #include "hphp/runtime/base/comparisons.h"
  26. namespace HPHP {
  /* Number of UChar elements that fit in a buffer of `len` bytes. */
  27. #define UCHARS(len) ((len) / sizeof(UChar))
  /* Number of bytes occupied by `len` UChar elements. */
  28. #define UBYTES(len) ((len) * sizeof(UChar))
  /* Forward declaration: the definition appears later in this file, after
   * collator_convert_string_to_number() which calls it. */
  29. static Variant collator_convert_string_to_number_if_possible(const Variant& str);
  30. static double collator_u_strtod(const UChar *nptr, UChar **endptr) {
  31. const UChar *u = nptr, *nstart;
  32. UChar c = *u;
  33. int any = 0;
  34. while (u_isspace(c)) {
  35. c = *++u;
  36. }
  37. nstart = u;
  38. if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) {
  39. c = *++u;
  40. }
  41. while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
  42. any = 1;
  43. c = *++u;
  44. }
  45. if (c == 0x2E /*'.'*/) {
  46. c = *++u;
  47. while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
  48. any = 1;
  49. c = *++u;
  50. }
  51. }
  52. if ((c == 0x65 /*'e'*/ || c == 0x45 /*'E'*/) && any) {
  53. const UChar *e = u;
  54. int any_exp = 0;
  55. c = *++u;
  56. if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) {
  57. c = *++u;
  58. }
  59. while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
  60. any_exp = 1;
  61. c = *++u;
  62. }
  63. if (!any_exp) {
  64. u = e;
  65. }
  66. }
  67. if (any) {
  68. char buf[64], *numbuf, *bufpos;
  69. int length = u - nstart;
  70. double value;
  71. if (length < (int)sizeof(buf)) {
  72. numbuf = buf;
  73. } else {
  74. numbuf = (char *) smart_malloc(length + 1);
  75. }
  76. bufpos = numbuf;
  77. while (nstart < u) {
  78. *bufpos++ = (char) *nstart++;
  79. }
  80. *bufpos = '\0';
  81. value = zend_strtod(numbuf, nullptr);
  82. if (numbuf != buf) {
  83. smart_free(numbuf);
  84. }
  85. if (endptr != nullptr) {
  86. *endptr = (UChar *)u;
  87. }
  88. return value;
  89. }
  90. if (endptr != nullptr) {
  91. *endptr = (UChar *)nptr;
  92. }
  93. return 0;
  94. }
  95. static long collator_u_strtol(const UChar *nptr, UChar **endptr,
  96. int base) {
  97. const UChar *s = nptr;
  98. unsigned long acc;
  99. UChar c;
  100. unsigned long cutoff;
  101. int neg = 0, any, cutlim;
  102. if (s == nullptr) {
  103. errno = ERANGE;
  104. if (endptr != nullptr) {
  105. *endptr = nullptr;
  106. }
  107. return 0;
  108. }
  109. /*
  110. * Skip white space and pick up leading +/- sign if any.
  111. * If base is 0, allow 0x for hex and 0 for octal, else
  112. * assume decimal; if base is already 16, allow 0x.
  113. */
  114. do {
  115. c = *s++;
  116. } while (u_isspace(c));
  117. if (c == 0x2D /*'-'*/) {
  118. neg = 1;
  119. c = *s++;
  120. } else if (c == 0x2B /*'+'*/)
  121. c = *s++;
  122. if ((base == 0 || base == 16) &&
  123. (c == 0x30 /*'0'*/)
  124. && (*s == 0x78 /*'x'*/ || *s == 0x58 /*'X'*/)) {
  125. c = s[1];
  126. s += 2;
  127. base = 16;
  128. }
  129. if (base == 0)
  130. base = (c == 0x30 /*'0'*/) ? 8 : 10;
  131. /*
  132. * Compute the cutoff value between legal numbers and illegal
  133. * numbers. That is the largest legal value, divided by the
  134. * base. An input number that is greater than this value, if
  135. * followed by a legal input character, is too big. One that
  136. * is equal to this value may be valid or not; the limit
  137. * between valid and invalid numbers is then based on the last
  138. * digit. For instance, if the range for longs is
  139. * [-2147483648..2147483647] and the input base is 10,
  140. * cutoff will be set to 214748364 and cutlim to either
  141. * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
  142. * a value > 214748364, or equal but the next digit is > 7 (or 8),
  143. * the number is too big, and we will return a range error.
  144. *
  145. * Set any if any `digits' consumed; make it negative to indicate
  146. * overflow.
  147. */
  148. cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
  149. cutlim = cutoff % (unsigned long)base;
  150. cutoff /= (unsigned long)base;
  151. for (acc = 0, any = 0;; c = *s++) {
  152. if (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/)
  153. c -= 0x30 /*'0'*/;
  154. else if (c >= 0x41 /*'A'*/ && c <= 0x5A /*'Z'*/)
  155. c -= 0x41 /*'A'*/ - 10;
  156. else if (c >= 0x61 /*'a'*/ && c <= 0x7A /*'z'*/)
  157. c -= 0x61 /*'a'*/ - 10;
  158. else
  159. break;
  160. if (c >= base)
  161. break;
  162. if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
  163. any = -1;
  164. else {
  165. any = 1;
  166. acc *= base;
  167. acc += c;
  168. }
  169. }
  170. if (any < 0) {
  171. acc = neg ? LONG_MIN : LONG_MAX;
  172. errno = ERANGE;
  173. } else if (neg)
  174. acc = -acc;
  175. if (endptr != nullptr)
  176. *endptr = (UChar *)(any ? s - 1 : nptr);
  177. return (acc);
  178. }
  179. static DataType collator_is_numeric(UChar *str, int length, int64_t *lval,
  180. double *dval, int allow_errors ) {
  181. int64_t local_lval;
  182. double local_dval;
  183. UChar *end_ptr_long, *end_ptr_double;
  184. int conv_base=10;
  185. if (!length) {
  186. return KindOfNull;
  187. }
  188. /* handle hex numbers */
  189. if (length>=2 && str[0]=='0' && (str[1]=='x' || str[1]=='X')) {
  190. conv_base=16;
  191. }
  192. errno=0;
  193. local_lval = collator_u_strtol(str, &end_ptr_long, conv_base);
  194. if (errno != ERANGE) {
  195. if (end_ptr_long == str+length) { /* integer string */
  196. if (lval) {
  197. *lval = local_lval;
  198. }
  199. return KindOfInt64;
  200. } else if (end_ptr_long == str &&
  201. *end_ptr_long != '\0' &&
  202. *str != '.' &&
  203. *str != '-') { /* ignore partial string matches */
  204. return KindOfNull;
  205. }
  206. } else {
  207. end_ptr_long = nullptr;
  208. }
  209. if (conv_base == 16) { /* hex string, under UNIX strtod() messes it up */
  210. /* UTODO: keep compatibility with is_numeric_string() here? */
  211. return KindOfNull;
  212. }
  213. local_dval = collator_u_strtod(str, &end_ptr_double);
  214. if (local_dval == 0 && end_ptr_double == str) {
  215. end_ptr_double = nullptr;
  216. } else {
  217. if (end_ptr_double == str+length) { /* floating point string */
  218. if (!finite(local_dval)) {
  219. /* "inf","nan" and maybe other weird ones */
  220. return KindOfNull;
  221. }
  222. if (dval) {
  223. *dval = local_dval;
  224. }
  225. return KindOfDouble;
  226. }
  227. }
  228. if (!allow_errors) {
  229. return KindOfNull;
  230. }
  231. if (allow_errors == -1) {
  232. raise_notice("A non well formed numeric value encountered");
  233. }
  234. if (allow_errors) {
  235. if (end_ptr_double > end_ptr_long && dval) {
  236. *dval = local_dval;
  237. return KindOfDouble;
  238. } else if (end_ptr_long && lval) {
  239. *lval = local_lval;
  240. return KindOfInt64;
  241. }
  242. }
  243. return KindOfNull;
  244. }
  245. static String intl_convert_str_utf8_to_utf16(const String& utf8_str,
  246. UErrorCode * status) {
  247. UChar* ustr = nullptr;
  248. int ustr_len = 0;
  249. intl_convert_utf8_to_utf16(&ustr, &ustr_len,
  250. utf8_str.data(), utf8_str.length(),
  251. status);
  252. if (U_FAILURE(*status)) {
  253. return (const char *)(L"");
  254. }
  255. return String((char*)ustr, UBYTES(ustr_len), AttachString);
  256. }
  257. static String intl_convert_str_utf16_to_utf8(const String& utf16_str,
  258. UErrorCode * status) {
  259. char* str = nullptr;
  260. int str_len = 0;
  261. intl_convert_utf16_to_utf8(&str, &str_len,
  262. (UChar*)(utf16_str.data()),
  263. UCHARS(utf16_str.length()),
  264. status);
  265. if (U_FAILURE(*status)) {
  266. return "";
  267. }
  268. return String(str, str_len, AttachString);
  269. }
  270. static Variant collator_convert_string_to_number(const Variant& str) {
  271. Variant num = collator_convert_string_to_number_if_possible(str);
  272. if (same(num, false)) {
  273. /* String wasn't converted => return zero. */
  274. return 0;
  275. }
  276. return num;
  277. }
  278. static Variant collator_convert_string_to_double(const Variant& str) {
  279. Variant num = collator_convert_string_to_number(str);
  280. return num.toDouble();
  281. }
  282. static Variant collator_convert_string_to_number_if_possible(const Variant& str) {
  283. int64_t lval = 0;
  284. double dval = 0;
  285. if (!str.isString()) return false;
  286. DataType ret = collator_is_numeric((UChar*)(str.toString().data()),
  287. UCHARS(str.toString().length()),
  288. &lval, &dval, 1);
  289. if (ret == KindOfInt64) return lval;
  290. if (ret == KindOfDouble) return dval;
  291. return false;
  292. }
  293. static Variant collator_convert_object_to_string(const Variant& obj) {
  294. if (!obj.isObject()) return obj;
  295. String str;
  296. try {
  297. str = obj.toString();
  298. } catch (Exception &e) {
  299. return obj;
  300. }
  301. UErrorCode status;
  302. String ustr = intl_convert_str_utf8_to_utf16(str, &status);
  303. if (U_FAILURE(status)) {
  304. raise_warning("Error casting object to string in "
  305. "collator_convert_object_to_string()");
  306. return uninit_null();
  307. }
  308. return ustr;
  309. }
  310. static void collator_convert_array_from_utf16_to_utf8(Array &array,
  311. UErrorCode * status) {
  312. for (ArrayIter iter(array); iter; ++iter) {
  313. const Variant& value = iter.secondRef();
  314. /* Process string values only. */
  315. if (!value.isString()) continue;
  316. String str = intl_convert_str_utf16_to_utf8(value.toString(), status);
  317. if (U_FAILURE(*status)) {
  318. return;
  319. }
  320. /* Update current value with the converted value. */
  321. Variant key = iter.first();
  322. array.set(key, str);
  323. }
  324. }
  325. static void collator_convert_array_from_utf8_to_utf16(Array &array,
  326. UErrorCode * status) {
  327. for (ArrayIter iter(array); iter; ++iter) {
  328. const Variant& value = iter.secondRef();
  329. /* Process string values only. */
  330. if (!value.isString()) continue;
  331. String str = intl_convert_str_utf8_to_utf16(value.toString(), status);
  332. if (U_FAILURE(*status)) {
  333. return;
  334. }
  335. /* Update current value with the converted value. */
  336. Variant key = iter.first();
  337. array.set(key, str);
  338. }
  339. }
  340. static Variant collator_normalize_sort_argument(const Variant& arg) {
  341. if (!arg.isString()) return arg;
  342. Variant n_arg = collator_convert_string_to_number_if_possible(arg);
  343. if (same(n_arg, false)) {
  344. /* Conversion to number failed. */
  345. UErrorCode status;
  346. n_arg = intl_convert_str_utf16_to_utf8(arg.toString(), &status);
  347. if (U_FAILURE(status)) {
  348. raise_warning("Error converting utf16 to utf8 in "
  349. "collator_normalize_sort_argument()");
  350. }
  351. }
  352. return n_arg;
  353. }
  354. static int collator_regular_compare_function(const Variant& v1, const Variant& v2,
  355. const void *data,
  356. bool ascending) {
  357. Variant str1 = collator_convert_object_to_string(v1);
  358. Variant str2 = collator_convert_object_to_string(v2);
  359. Variant num1;
  360. Variant num2;
  361. Variant norm1;
  362. Variant norm2;
  363. /* If both args are strings AND either of args is not numeric string
  364. * then use ICU-compare. Otherwise PHP-compare. */
  365. if (str1.isString() && str2.isString()) {
  366. num1 = collator_convert_string_to_number_if_possible(str1);
  367. if (!same(num1, false)) {
  368. num2 = collator_convert_string_to_number_if_possible(str2);
  369. }
  370. if (same(num1, false) || same(num2, false)) {
  371. assert(data);
  372. int ret = ucol_strcoll((const UCollator *)data,
  373. (UChar*)(str1.toString().data()),
  374. UCHARS(str1.toString().length()),
  375. (UChar*)(str2.toString().data()),
  376. UCHARS(str2.toString().length()));
  377. return ascending ? ret : (-ret);
  378. }
  379. }
  380. /* num1 is set if str1 and str2 are strings. */
  381. if (!num1.isNull()) {
  382. if (same(num1, false)) {
  383. /* str1 is string but not numeric string just convert it to utf8. */
  384. UErrorCode status;
  385. norm1 = intl_convert_str_utf16_to_utf8(str1.toString(), &status);
  386. if (U_FAILURE(status)) {
  387. raise_warning("Error converting utf16 to utf8 in "
  388. "collator_regular_compare_function()");
  389. }
  390. /* num2 is not set but str2 is string => do normalization. */
  391. norm2 = collator_normalize_sort_argument(str2);
  392. } else {
  393. /* str1 is numeric strings => passthru to PHP-compare. */
  394. norm1 = num1;
  395. norm2 = num2;
  396. }
  397. } else {
  398. /* str1 or str2 is not a string => do normalization. */
  399. norm1 = collator_normalize_sort_argument(str1);
  400. norm2 = collator_normalize_sort_argument(str2);
  401. }
  402. if (ascending) {
  403. if (less(norm1, norm2)) return -1;
  404. if (equal(norm1, norm2)) return 0;
  405. return 1;
  406. }
  407. if (less(norm1, norm2)) return 1;
  408. if (equal(norm1, norm2)) return 0;
  409. return -1;
  410. }
  411. static int collator_regular_compare_ascending(const Variant& v1, const Variant& v2,
  412. const void *data) {
  413. return collator_regular_compare_function(v1, v2, data, true);
  414. }
  415. static int collator_regular_compare_descending(const Variant& v1, const Variant& v2,
  416. const void *data) {
  417. return collator_regular_compare_function(v1, v2, data, false);
  418. }
  419. static int collator_numeric_compare_function(const Variant& v1, const Variant& v2,
  420. const void *data,
  421. bool ascending) {
  422. Variant num1;
  423. Variant num2;
  424. if (v1.isString()) {
  425. num1 = collator_convert_string_to_double(v1);
  426. } else {
  427. num1 = v1.toDouble();
  428. }
  429. if (v2.isString()) {
  430. num2 = collator_convert_string_to_double(v2);
  431. } else {
  432. num2 = v2.toDouble();
  433. }
  434. if (ascending) {
  435. if (less(num1, num2)) return -1;
  436. if (equal(num1, num2)) return 0;
  437. return 1;
  438. }
  439. if (less(num1, num2)) return 1;
  440. if (equal(num1, num2)) return 0;
  441. return -1;
  442. }
  443. static int collator_numeric_compare_ascending(const Variant& v1, const Variant& v2,
  444. const void *data) {
  445. return collator_numeric_compare_function(v1, v2, data, true);
  446. }
  447. static int collator_numeric_compare_descending(const Variant& v1, const Variant& v2,
  448. const void *data) {
  449. return collator_numeric_compare_function(v1, v2, data, false);
  450. }
  451. static int collator_string_compare_function(const Variant& v1, const Variant& v2,
  452. const void *data,
  453. bool ascending) {
  454. assert(data);
  455. String str1;
  456. if (v1.isString()) {
  457. str1 = v1.toString();
  458. } else {
  459. UErrorCode status;
  460. str1 = intl_convert_str_utf8_to_utf16(v1.toString(), &status);
  461. if (U_FAILURE(status)) {
  462. raise_warning("Error converting utf8 to utf16 in "
  463. "collator_string_compare_function()");
  464. }
  465. }
  466. String str2;
  467. if (v2.isString()) {
  468. str2 = v2.toString();
  469. } else {
  470. UErrorCode status;
  471. str2 = intl_convert_str_utf8_to_utf16(v2.toString(), &status);
  472. if (U_FAILURE(status)) {
  473. raise_warning("Error converting utf8 to utf16 in "
  474. "collator_string_compare_function()");
  475. }
  476. }
  477. int ret = ucol_strcoll((const UCollator *)data,
  478. (UChar*)(str1.data()),
  479. UCHARS(str1.length()),
  480. (UChar*)(str2.data()),
  481. UCHARS(str2.length()));
  482. return ascending ? ret : (-ret);
  483. }
  484. static int collator_string_compare_ascending(const Variant& v1, const Variant& v2,
  485. const void *data) {
  486. return collator_string_compare_function(v1, v2, data, true);
  487. }
  488. static int collator_string_compare_descending(const Variant& v1, const Variant& v2,
  489. const void *data) {
  490. return collator_string_compare_function(v1, v2, data, false);
  491. }
  492. static bool collator_sort_internal(bool renumber, Variant &array,
  493. int sort_flags, bool ascending, bool byKey,
  494. UCollator *coll, Intl::IntlError *errcode) {
  495. assert(coll);
  496. errcode->clearError();
  497. Array temp = array.toArray();
  498. Array::PFUNC_CMP cmp_func;
  499. switch (sort_flags) {
  500. case COLLATOR_SORT_NUMERIC:
  501. cmp_func = ascending ? collator_numeric_compare_ascending
  502. : collator_numeric_compare_descending;
  503. break;
  504. case COLLATOR_SORT_STRING:
  505. cmp_func = ascending ? collator_string_compare_ascending
  506. : collator_string_compare_descending;
  507. break;
  508. case COLLATOR_SORT_REGULAR:
  509. default:
  510. cmp_func = ascending ? collator_regular_compare_ascending
  511. : collator_regular_compare_descending;
  512. break;
  513. }
  514. /* Convert strings in the specified array from UTF-8 to UTF-16. */
  515. UErrorCode error = U_ZERO_ERROR;
  516. collator_convert_array_from_utf8_to_utf16(temp, &error);
  517. if (U_FAILURE(error)) {
  518. errcode->setError(error, "Error converting array from UTF-8 to UTF-16");
  519. return false;
  520. }
  521. /* Sort specified array. */
  522. temp.sort(cmp_func, byKey, renumber, coll);
  523. /* Convert strings in the specified array back to UTF-8. */
  524. errcode->clearError();
  525. error = U_ZERO_ERROR;
  526. collator_convert_array_from_utf16_to_utf8(temp, &error);
  527. if (U_FAILURE(error)) {
  528. errcode->setError(error, "Error converting array from UTF-16 to UTF-8");
  529. return false;
  530. }
  531. array = temp;
  532. return true;
  533. }
  534. bool collator_sort(Variant &array, int sort_flags, bool ascending,
  535. UCollator *coll, Intl::IntlError *errcode) {
  536. assert(coll);
  537. bool byKey = false;
  538. bool ret = collator_sort_internal(true, array, sort_flags, ascending, byKey,
  539. coll, errcode);
  540. return ret;
  541. }
  542. bool collator_asort(Variant &array, int sort_flags, bool ascending,
  543. UCollator *coll, Intl::IntlError *errcode) {
  544. assert(coll);
  545. bool byKey = false;
  546. bool ret = collator_sort_internal(false, array, sort_flags, ascending, byKey,
  547. coll, errcode);
  548. return ret;
  549. }
  550. bool collator_ksort(Variant &array, int sort_flags, bool ascending,
  551. UCollator *coll, Intl::IntlError *errcode) {
  552. assert(coll);
  553. bool byKey = true;
  554. bool ret = collator_sort_internal(false, array, sort_flags, ascending, byKey,
  555. coll, errcode);
  556. return ret;
  557. }
  558. ///////////////////////////////////////////////////////////////////////////////
  559. }