PageRenderTime 46ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/uClinux-dist/lib/libkrb5/lib/krb5/unicode/ucstr.c

https://bitbucket.org/__wp__/mb-linux-msli
C | 452 lines | 356 code | 40 blank | 56 comment | 120 complexity | c54cb93041336273479d541a1ba90755 MD5 | raw file
Possible License(s): AGPL-3.0, GPL-2.0, LGPL-2.0, MPL-2.0, ISC, BSD-3-Clause, LGPL-2.1, MPL-2.0-no-copyleft-exception, 0BSD, CC-BY-SA-3.0, GPL-3.0, LGPL-3.0, AGPL-1.0, Unlicense
  1. /*
  2. * $OpenLDAP: pkg/ldap/libraries/liblunicode/ucstr.c,v 1.40 2008/03/04
  3. * 06:24:05 hyc Exp $
  4. */
  5. /*
  6. * This work is part of OpenLDAP Software <http://www.openldap.org/>.
  7. *
  8. * Copyright 1998-2008 The OpenLDAP Foundation. All rights reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted only as authorized by the OpenLDAP Public
  12. * License.
  13. *
  14. * A copy of this license is available in file LICENSE in the top-level
  15. * directory of the distribution or, alternatively, at
  16. * <http://www.OpenLDAP.org/license.html>.
  17. */
  18. #include "k5-int.h"
  19. #include "k5-utf8.h"
  20. #include "k5-unicode.h"
  21. #include "ucdata/ucdata.h"
  22. #include <ctype.h>
  23. int
  24. krb5int_ucstrncmp(
  25. const krb5_unicode * u1,
  26. const krb5_unicode * u2,
  27. size_t n)
  28. {
  29. for (; 0 < n; ++u1, ++u2, --n) {
  30. if (*u1 != *u2) {
  31. return *u1 < *u2 ? -1 : +1;
  32. }
  33. if (*u1 == 0) {
  34. return 0;
  35. }
  36. }
  37. return 0;
  38. }
  39. int
  40. krb5int_ucstrncasecmp(
  41. const krb5_unicode * u1,
  42. const krb5_unicode * u2,
  43. size_t n)
  44. {
  45. for (; 0 < n; ++u1, ++u2, --n) {
  46. krb5_unicode uu1 = uctolower(*u1);
  47. krb5_unicode uu2 = uctolower(*u2);
  48. if (uu1 != uu2) {
  49. return uu1 < uu2 ? -1 : +1;
  50. }
  51. if (uu1 == 0) {
  52. return 0;
  53. }
  54. }
  55. return 0;
  56. }
  57. krb5_unicode *
  58. krb5int_ucstrnchr(
  59. const krb5_unicode * u,
  60. size_t n,
  61. krb5_unicode c)
  62. {
  63. for (; 0 < n; ++u, --n) {
  64. if (*u == c) {
  65. return (krb5_unicode *) u;
  66. }
  67. }
  68. return NULL;
  69. }
  70. krb5_unicode *
  71. krb5int_ucstrncasechr(
  72. const krb5_unicode * u,
  73. size_t n,
  74. krb5_unicode c)
  75. {
  76. c = uctolower(c);
  77. for (; 0 < n; ++u, --n) {
  78. if (uctolower(*u) == c) {
  79. return (krb5_unicode *) u;
  80. }
  81. }
  82. return NULL;
  83. }
  84. void
  85. krb5int_ucstr2upper(
  86. krb5_unicode * u,
  87. size_t n)
  88. {
  89. for (; 0 < n; ++u, --n) {
  90. *u = uctoupper(*u);
  91. }
  92. }
  /*
   * Single-byte case mapping helpers built on <ctype.h>.
   *
   * They are meant for `char` arguments.  The argument is converted to
   * unsigned char before it reaches islower()/isupper()/toupper()/tolower(),
   * because handing those functions a negative plain char is undefined
   * behavior.  A byte that is not a cased letter is returned unchanged.
   * The argument is evaluated more than once, so it must be free of side
   * effects.
   */
  #define TOUPPER(c) \
      (islower((unsigned char)(c)) ? toupper((unsigned char)(c)) : (c))
  #define TOLOWER(c) \
      (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
  95. krb5_error_code
  96. krb5int_utf8_normalize(
  97. krb5_data * data,
  98. krb5_data ** newdataptr,
  99. unsigned flags)
  100. {
  101. int i, j, len, clen, outpos, ucsoutlen, outsize, last;
  102. char *out = NULL, *outtmp, *s;
  103. krb5_ucs4 *ucs = NULL, *p, *ucsout = NULL;
  104. krb5_data *newdata;
  105. krb5_error_code retval = 0;
  106. static unsigned char mask[] = {
  107. 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01};
  108. unsigned casefold = flags & KRB5_UTF8_CASEFOLD;
  109. unsigned approx = flags & KRB5_UTF8_APPROX;
  110. *newdataptr = NULL;
  111. s = data->data;
  112. len = data->length;
  113. newdata = malloc(sizeof(*newdata));
  114. if (newdata == NULL)
  115. return ENOMEM;
  116. /*
  117. * Should first check to see if string is already in proper normalized
  118. * form. This is almost as time consuming as the normalization though.
  119. */
  120. /* finish off everything up to character before first non-ascii */
  121. if (KRB5_UTF8_ISASCII(s)) {
  122. if (casefold) {
  123. outsize = len + 7;
  124. out = malloc(outsize);
  125. if (out == NULL) {
  126. retval = ENOMEM;
  127. goto cleanup;
  128. }
  129. outpos = 0;
  130. for (i = 1; (i < len) && KRB5_UTF8_ISASCII(s + i); i++) {
  131. out[outpos++] = TOLOWER(s[i - 1]);
  132. }
  133. if (i == len) {
  134. out[outpos++] = TOLOWER(s[len - 1]);
  135. goto cleanup;
  136. }
  137. } else {
  138. for (i = 1; (i < len) && KRB5_UTF8_ISASCII(s + i); i++) {
  139. /* empty */
  140. }
  141. if (i == len) {
  142. newdata->length = len;
  143. newdata->data = malloc(newdata->length + 1);
  144. if (newdata->data == NULL) {
  145. retval = ENOMEM;
  146. goto cleanup;
  147. }
  148. memcpy(newdata->data, s, len);
  149. newdata->data[len] = '\0';
  150. *newdataptr = newdata;
  151. return 0;
  152. }
  153. outsize = len + 7;
  154. out = malloc(outsize);
  155. if (out == NULL) {
  156. retval = ENOMEM;
  157. goto cleanup;
  158. }
  159. outpos = i - 1;
  160. memcpy(out, s, outpos);
  161. }
  162. } else {
  163. outsize = len + 7;
  164. out = malloc(outsize);
  165. if (out == NULL) {
  166. retval = ENOMEM;
  167. goto cleanup;
  168. }
  169. outpos = 0;
  170. i = 0;
  171. }
  172. p = ucs = malloc(len * sizeof(*ucs));
  173. if (ucs == NULL) {
  174. retval = ENOMEM;
  175. goto cleanup;
  176. }
  177. /* convert character before first non-ascii to ucs-4 */
  178. if (i > 0) {
  179. *p = casefold ? TOLOWER(s[i - 1]) : s[i - 1];
  180. p++;
  181. }
  182. /* s[i] is now first non-ascii character */
  183. for (;;) {
  184. /* s[i] is non-ascii */
  185. /* convert everything up to next ascii to ucs-4 */
  186. while (i < len) {
  187. clen = KRB5_UTF8_CHARLEN2(s + i, clen);
  188. if (clen == 0) {
  189. retval = KRB5_ERR_INVALID_UTF8;
  190. goto cleanup;
  191. }
  192. if (clen == 1) {
  193. /* ascii */
  194. break;
  195. }
  196. *p = s[i] & mask[clen];
  197. i++;
  198. for (j = 1; j < clen; j++) {
  199. if ((s[i] & 0xc0) != 0x80) {
  200. retval = KRB5_ERR_INVALID_UTF8;
  201. goto cleanup;
  202. }
  203. *p <<= 6;
  204. *p |= s[i] & 0x3f;
  205. i++;
  206. }
  207. if (casefold) {
  208. *p = uctolower(*p);
  209. }
  210. p++;
  211. }
  212. /* normalize ucs of length p - ucs */
  213. uccompatdecomp(ucs, p - ucs, &ucsout, &ucsoutlen);
  214. if (approx) {
  215. for (j = 0; j < ucsoutlen; j++) {
  216. if (ucsout[j] < 0x80) {
  217. out[outpos++] = ucsout[j];
  218. }
  219. }
  220. } else {
  221. ucsoutlen = uccanoncomp(ucsout, ucsoutlen);
  222. /* convert ucs to utf-8 and store in out */
  223. for (j = 0; j < ucsoutlen; j++) {
  224. /*
  225. * allocate more space if not enough room for 6 bytes and
  226. * terminator
  227. */
  228. if (outsize - outpos < 7) {
  229. outsize = ucsoutlen - j + outpos + 6;
  230. outtmp = realloc(out, outsize);
  231. if (outtmp == NULL) {
  232. retval = ENOMEM;
  233. goto cleanup;
  234. }
  235. out = outtmp;
  236. }
  237. outpos += krb5int_ucs4_to_utf8(ucsout[j], &out[outpos]);
  238. }
  239. }
  240. free(ucsout);
  241. ucsout = NULL;
  242. if (i == len) {
  243. break;
  244. }
  245. last = i;
  246. /* Allocate more space in out if necessary */
  247. if (len - i >= outsize - outpos) {
  248. outsize += 1 + ((len - i) - (outsize - outpos));
  249. outtmp = realloc(out, outsize);
  250. if (outtmp == NULL) {
  251. retval = ENOMEM;
  252. goto cleanup;
  253. }
  254. out = outtmp;
  255. }
  256. /* s[i] is ascii */
  257. /* finish off everything up to char before next non-ascii */
  258. for (i++; (i < len) && KRB5_UTF8_ISASCII(s + i); i++) {
  259. out[outpos++] = casefold ? TOLOWER(s[i - 1]) : s[i - 1];
  260. }
  261. if (i == len) {
  262. out[outpos++] = casefold ? TOLOWER(s[len - 1]) : s[len - 1];
  263. break;
  264. }
  265. /* convert character before next non-ascii to ucs-4 */
  266. *ucs = casefold ? TOLOWER(s[i - 1]) : s[i - 1];
  267. p = ucs + 1;
  268. }
  269. cleanup:
  270. free(ucs);
  271. free(ucsout);
  272. if (retval) {
  273. free(out);
  274. free(newdata);
  275. return retval;
  276. }
  277. out[outpos] = '\0';
  278. newdata->data = out;
  279. newdata->length = outpos;
  280. *newdataptr = newdata;
  281. return 0;
  282. }
  283. /* compare UTF8-strings, optionally ignore casing */
  284. /* slow, should be optimized */
  285. int
  286. krb5int_utf8_normcmp(
  287. const krb5_data * data1,
  288. const krb5_data * data2,
  289. unsigned flags)
  290. {
  291. int i, l1, l2, len, ulen, res = 0;
  292. char *s1, *s2, *done;
  293. krb5_ucs4 *ucs, *ucsout1, *ucsout2;
  294. unsigned casefold = flags & KRB5_UTF8_CASEFOLD;
  295. unsigned norm1 = flags & KRB5_UTF8_ARG1NFC;
  296. unsigned norm2 = flags & KRB5_UTF8_ARG2NFC;
  297. if (data1 == NULL) {
  298. return data2 == NULL ? 0 : -1;
  299. } else if (data2 == NULL) {
  300. return 1;
  301. }
  302. l1 = data1->length;
  303. l2 = data2->length;
  304. len = (l1 < l2) ? l1 : l2;
  305. if (len == 0) {
  306. return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
  307. }
  308. s1 = data1->data;
  309. s2 = data2->data;
  310. done = s1 + len;
  311. while ((s1 < done) && KRB5_UTF8_ISASCII(s1) && KRB5_UTF8_ISASCII(s2)) {
  312. if (casefold) {
  313. char c1 = TOLOWER(*s1);
  314. char c2 = TOLOWER(*s2);
  315. res = c1 - c2;
  316. } else {
  317. res = *s1 - *s2;
  318. }
  319. s1++;
  320. s2++;
  321. if (res) {
  322. /* done unless next character in s1 or s2 is non-ascii */
  323. if (s1 < done) {
  324. if (!KRB5_UTF8_ISASCII(s1) || !KRB5_UTF8_ISASCII(s2)) {
  325. break;
  326. }
  327. } else if (((len < l1) && !KRB5_UTF8_ISASCII(s1)) ||
  328. ((len < l2) && !KRB5_UTF8_ISASCII(s2))) {
  329. break;
  330. }
  331. return res;
  332. }
  333. }
  334. /* We have encountered non-ascii or strings equal up to len */
  335. /* set i to number of iterations */
  336. i = s1 - done + len;
  337. /* passed through loop at least once? */
  338. if (i > 0) {
  339. if (!res && (s1 == done) &&
  340. ((len == l1) || KRB5_UTF8_ISASCII(s1)) &&
  341. ((len == l2) || KRB5_UTF8_ISASCII(s2))) {
  342. /* all ascii and equal up to len */
  343. return l1 - l2;
  344. }
  345. /* rewind one char, and do normalized compare from there */
  346. s1--;
  347. s2--;
  348. l1 -= i - 1;
  349. l2 -= i - 1;
  350. }
  351. /*
  352. * Should first check to see if strings are already in proper normalized
  353. * form.
  354. */
  355. ucs = malloc(((norm1 || l1 > l2) ? l1 : l2) * sizeof(*ucs));
  356. if (ucs == NULL) {
  357. return l1 > l2 ? 1 : -1;/* what to do??? */
  358. }
  359. /*
  360. * XXYYZ: we convert to ucs4 even though -llunicode
  361. * expects ucs2 in an ac_uint4
  362. */
  363. /* convert and normalize 1st string */
  364. for (i = 0, ulen = 0; i < l1; i += len, ulen++) {
  365. if (krb5int_utf8_to_ucs4(s1 + i, &ucs[ulen]) == -1) {
  366. free(ucs);
  367. return -1; /* what to do??? */
  368. }
  369. len = KRB5_UTF8_CHARLEN(s1 + i);
  370. }
  371. if (norm1) {
  372. ucsout1 = ucs;
  373. l1 = ulen;
  374. ucs = malloc(l2 * sizeof(*ucs));
  375. if (ucs == NULL) {
  376. free(ucsout1);
  377. return l1 > l2 ? 1 : -1; /* what to do??? */
  378. }
  379. } else {
  380. uccompatdecomp(ucs, ulen, &ucsout1, &l1);
  381. l1 = uccanoncomp(ucsout1, l1);
  382. }
  383. /* convert and normalize 2nd string */
  384. for (i = 0, ulen = 0; i < l2; i += len, ulen++) {
  385. if (krb5int_utf8_to_ucs4(s2 + i, &ucs[ulen]) == -1) {
  386. free(ucsout1);
  387. free(ucs);
  388. return 1; /* what to do??? */
  389. }
  390. len = KRB5_UTF8_CHARLEN(s2 + i);
  391. }
  392. if (norm2) {
  393. ucsout2 = ucs;
  394. l2 = ulen;
  395. } else {
  396. uccompatdecomp(ucs, ulen, &ucsout2, &l2);
  397. l2 = uccanoncomp(ucsout2, l2);
  398. free(ucs);
  399. }
  400. res = casefold
  401. ? krb5int_ucstrncasecmp(ucsout1, ucsout2, l1 < l2 ? l1 : l2)
  402. : krb5int_ucstrncmp(ucsout1, ucsout2, l1 < l2 ? l1 : l2);
  403. free(ucsout1);
  404. free(ucsout2);
  405. if (res != 0) {
  406. return res;
  407. }
  408. if (l1 == l2) {
  409. return 0;
  410. }
  411. return l1 > l2 ? 1 : -1;
  412. }