PageRenderTime 83ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/src/racket/src/string.c

http://github.com/gmarceau/PLT
C | 5657 lines | 4630 code | 683 blank | 344 comment | 1048 complexity | 0664fdaee9cf399f842278273762caf3 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1
  1. /*
  2. Racket
  3. Copyright (c) 2004-2011 PLT Scheme Inc.
  4. Copyright (c) 1995-2001 Matthew Flatt
  5. This library is free software; you can redistribute it and/or
  6. modify it under the terms of the GNU Library General Public
  7. License as published by the Free Software Foundation; either
  8. version 2 of the License, or (at your option) any later version.
  9. This library is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. Library General Public License for more details.
  13. You should have received a copy of the GNU Library General Public
  14. License along with this library; if not, write to the Free
  15. Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  16. Boston, MA 02110-1301 USA.
  17. libscheme
  18. Copyright (c) 1994 Brent Benson
  19. All rights reserved.
  20. */
  21. #include "schpriv.h"
  22. #include "schvers.h"
  23. #include <string.h>
  24. #include <ctype.h>
  25. #ifndef DONT_USE_LOCALE
  26. # include <locale.h>
  27. # ifdef MZ_NO_ICONV
  28. # define USE_ICONV_DLL
  29. # endif
  30. # ifndef USE_ICONV_DLL
  31. # include <iconv.h>
  32. # include <langinfo.h>
  33. # endif
  34. # include <wchar.h>
  35. # include <wctype.h>
  36. # include <errno.h>
  37. # ifdef MACOS_UNICODE_SUPPORT
  38. # include <CoreFoundation/CFString.h>
  39. # include <CoreFoundation/CFLocale.h>
  40. # endif
  41. # ifdef WINDOWS_UNICODE_SUPPORT
  42. # include <windows.h>
  43. # endif
  44. #endif
  45. #ifndef SCHEME_PLATFORM_LIBRARY_SUBPATH
  46. # include "schsys.h"
  47. #endif
  48. #include "schustr.inc"
  49. #ifdef USE_ICONV_DLL
  50. typedef intptr_t iconv_t;
  51. typedef int *(*errno_proc_t)();
  52. typedef size_t (*iconv_proc_t)(iconv_t cd,
  53. char **inbuf, size_t *inbytesleft,
  54. char **outbuf, size_t *outbytesleft);
  55. typedef iconv_t (*iconv_open_proc_t)(const char *tocode, const char *fromcode);
  56. typedef void (*iconv_close_proc_t)(iconv_t cd);
  57. typedef char *(*locale_charset_proc_t)();
  58. static errno_proc_t iconv_errno;
  59. static iconv_proc_t iconv;
  60. static iconv_open_proc_t iconv_open;
  61. static iconv_close_proc_t iconv_close;
  62. static locale_charset_proc_t locale_charset; /* Not used, currently */
  63. #define mzCHK_PROC(x) x
  64. static int get_iconv_errno(void)
  65. {
  66. int *a;
  67. a = iconv_errno();
  68. return *a;
  69. }
  70. # undef HAVE_CODESET
  71. # define HAVE_CODESET 1
  72. # define CODESET 0
  73. # define ICONV_errno get_iconv_errno()
  74. extern wchar_t *scheme_get_dll_path(wchar_t *s);
  75. static int iconv_ready = 0;
  76. static void init_iconv()
  77. {
  78. # ifdef MZ_NO_ICONV
  79. # else
  80. HMODULE m;
  81. m = LoadLibraryW(scheme_get_dll_path(L"iconv.dll"));
  82. if (!m)
  83. m = LoadLibraryW(scheme_get_dll_path(L"libiconv.dll"));
  84. if (!m)
  85. m = LoadLibraryW(scheme_get_dll_path(L"libiconv-2.dll"));
  86. if (!m)
  87. m = LoadLibrary("iconv.dll");
  88. if (!m)
  89. m = LoadLibrary("libiconv.dll");
  90. if (!m)
  91. m = LoadLibrary("libiconv-2.dll");
  92. if (m) {
  93. iconv = (iconv_proc_t)GetProcAddress(m, "libiconv");
  94. iconv_open = (iconv_open_proc_t)GetProcAddress(m, "libiconv_open");
  95. iconv_close = (iconv_close_proc_t)GetProcAddress(m, "libiconv_close");
  96. locale_charset = (locale_charset_proc_t)GetProcAddress(m, "locale_charset");
  97. /* Make sure we have all of them or none: */
  98. if (!iconv || !iconv_open || !iconv_close) {
  99. iconv = NULL;
  100. iconv_open = NULL;
  101. iconv_close = NULL;
  102. }
  103. }
  104. if (iconv) {
  105. iconv_errno = (errno_proc_t)GetProcAddress(m, "_errno");
  106. if (!iconv_errno) {
  107. /* The iconv.dll distributed with PLT Scheme links to msvcrt.dll.
  108. It's a slighly dangerous assumption that whatever iconv we
  109. found also uses msvcrt.dll. */
  110. m = LoadLibrary("msvcrt.dll");
  111. if (m) {
  112. iconv_errno = (errno_proc_t)GetProcAddress(m, "_errno");
  113. if (!iconv_errno) {
  114. iconv = NULL;
  115. iconv_open = NULL;
  116. iconv_close = NULL;
  117. }
  118. }
  119. }
  120. }
  121. # endif
  122. iconv_ready = 1;
  123. }
  124. #else
  125. # define ICONV_errno errno
  126. # define iconv_ready 1
  127. # define mzCHK_PROC(x) 1
  128. static void init_iconv() { }
  129. #endif
  130. #ifdef MACOS_UNICODE_SUPPORT
  131. # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
  132. #endif
  133. #ifdef WINDOWS_UNICODE_SUPPORT
  134. # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
  135. #endif
  136. #ifndef mzLOCALE_IS_UTF_8
  137. # define mzLOCALE_IS_UTF_8(s) !mzCHK_PROC(iconv_open)
  138. #endif
  139. #define mzICONV_KIND 0
  140. #define mzUTF8_KIND 1
  141. #define mzUTF8_TO_UTF16_KIND 2
  142. #define mzUTF16_TO_UTF8_KIND 3
  143. typedef struct Scheme_Converter {
  144. Scheme_Object so;
  145. short closed;
  146. short kind;
  147. iconv_t cd;
  148. int permissive;
  149. Scheme_Custodian_Reference *mref;
  150. } Scheme_Converter;
  151. /* locals */
  152. /* These two locale variables are only valid when reset_locale()
  153. is called after continuation marks (and hence parameterization)
  154. may have changed. Similarly, setlocale() is only up-to-date
  155. when reset_locale() has been called. */
  156. THREAD_LOCAL_DECL(static int locale_on);
  157. THREAD_LOCAL_DECL(static void *current_locale_name_ptr);
  158. static void reset_locale(void);
  159. #define current_locale_name ((const mzchar *)current_locale_name_ptr)
  160. #ifdef USE_ICONV_DLL
  161. static char *nl_langinfo(int which)
  162. {
  163. int i;
  164. reset_locale();
  165. if (!current_locale_name)
  166. current_locale_name_ptr = "\0\0\0\0";
  167. if ((current_locale_name[0] == 'C')
  168. && !current_locale_name[1])
  169. return "US-ASCII";
  170. for (i = 0; current_locale_name[i]; i++) {
  171. if (current_locale_name[i] == '.') {
  172. if (current_locale_name[i + 1]) {
  173. int len, j = 0;
  174. char *enc;
  175. i++;
  176. len = scheme_char_strlen(current_locale_name) - i;
  177. enc = (char *)scheme_malloc_atomic(len + 1);
  178. while (current_locale_name[i]) {
  179. if (current_locale_name[i] > 127)
  180. return "UTF-8";
  181. enc[j++] = current_locale_name[i++];
  182. }
  183. enc[j] = 0;
  184. return enc;
  185. }
  186. }
  187. }
  188. return "UTF-8";
  189. }
  190. #endif
  191. #ifdef DONT_USE_LOCALE
  192. # define mz_iconv_nl_langinfo() ""
  193. #else
  194. static char *mz_iconv_nl_langinfo(){
  195. char *s;
  196. # if HAVE_CODESET
  197. s = nl_langinfo(CODESET);
  198. # else
  199. s = NULL;
  200. # endif
  201. if (!s)
  202. return "";
  203. else
  204. return s;
  205. }
  206. #endif
  207. READ_ONLY static const char * const STRING_IS_NOT_UTF_8 = "string is not a well-formed UTF-8 encoding: ";
  208. static Scheme_Object *make_string (int argc, Scheme_Object *argv[]);
  209. static Scheme_Object *string (int argc, Scheme_Object *argv[]);
  210. static Scheme_Object *string_p (int argc, Scheme_Object *argv[]);
  211. static Scheme_Object *string_length (int argc, Scheme_Object *argv[]);
  212. static Scheme_Object *string_eq (int argc, Scheme_Object *argv[]);
  213. static Scheme_Object *string_locale_eq (int argc, Scheme_Object *argv[]);
  214. static Scheme_Object *string_ci_eq (int argc, Scheme_Object *argv[]);
  215. static Scheme_Object *string_locale_ci_eq (int argc, Scheme_Object *argv[]);
  216. static Scheme_Object *string_lt (int argc, Scheme_Object *argv[]);
  217. static Scheme_Object *string_locale_lt (int argc, Scheme_Object *argv[]);
  218. static Scheme_Object *string_gt (int argc, Scheme_Object *argv[]);
  219. static Scheme_Object *string_locale_gt (int argc, Scheme_Object *argv[]);
  220. static Scheme_Object *string_lt_eq (int argc, Scheme_Object *argv[]);
  221. static Scheme_Object *string_gt_eq (int argc, Scheme_Object *argv[]);
  222. static Scheme_Object *string_ci_lt (int argc, Scheme_Object *argv[]);
  223. static Scheme_Object *string_locale_ci_lt (int argc, Scheme_Object *argv[]);
  224. static Scheme_Object *string_ci_gt (int argc, Scheme_Object *argv[]);
  225. static Scheme_Object *string_locale_ci_gt (int argc, Scheme_Object *argv[]);
  226. static Scheme_Object *string_ci_lt_eq (int argc, Scheme_Object *argv[]);
  227. static Scheme_Object *string_ci_gt_eq (int argc, Scheme_Object *argv[]);
  228. static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[]);
  229. static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[]);
  230. static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[]);
  231. static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[]);
  232. static Scheme_Object *string_locale_upcase (int argc, Scheme_Object *argv[]);
  233. static Scheme_Object *string_locale_downcase (int argc, Scheme_Object *argv[]);
  234. static Scheme_Object *substring (int argc, Scheme_Object *argv[]);
  235. static Scheme_Object *string_append (int argc, Scheme_Object *argv[]);
  236. static Scheme_Object *string_to_list (int argc, Scheme_Object *argv[]);
  237. static Scheme_Object *list_to_string (int argc, Scheme_Object *argv[]);
  238. static Scheme_Object *string_copy (int argc, Scheme_Object *argv[]);
  239. static Scheme_Object *string_copy_bang (int argc, Scheme_Object *argv[]);
  240. static Scheme_Object *string_fill (int argc, Scheme_Object *argv[]);
  241. static Scheme_Object *string_to_immutable (int argc, Scheme_Object *argv[]);
  242. static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[]);
  243. static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[]);
  244. static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[]);
  245. static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[]);
  246. static Scheme_Object *make_shared_byte_string (int argc, Scheme_Object *argv[]);
  247. static Scheme_Object *shared_byte_string (int argc, Scheme_Object *argv[]);
  248. static Scheme_Object *make_byte_string (int argc, Scheme_Object *argv[]);
  249. static Scheme_Object *byte_string (int argc, Scheme_Object *argv[]);
  250. static Scheme_Object *byte_p (int argc, Scheme_Object *argv[]);
  251. static Scheme_Object *byte_string_p (int argc, Scheme_Object *argv[]);
  252. static Scheme_Object *byte_string_length (int argc, Scheme_Object *argv[]);
  253. static Scheme_Object *byte_string_eq (int argc, Scheme_Object *argv[]);
  254. static Scheme_Object *byte_string_lt (int argc, Scheme_Object *argv[]);
  255. static Scheme_Object *byte_string_gt (int argc, Scheme_Object *argv[]);
  256. static Scheme_Object *byte_substring (int argc, Scheme_Object *argv[]);
  257. static Scheme_Object *byte_string_append (int argc, Scheme_Object *argv[]);
  258. static Scheme_Object *byte_string_to_list (int argc, Scheme_Object *argv[]);
  259. static Scheme_Object *list_to_byte_string (int argc, Scheme_Object *argv[]);
  260. static Scheme_Object *byte_string_copy (int argc, Scheme_Object *argv[]);
  261. static Scheme_Object *byte_string_copy_bang (int argc, Scheme_Object *argv[]);
  262. static Scheme_Object *byte_string_fill (int argc, Scheme_Object *argv[]);
  263. static Scheme_Object *byte_string_to_immutable (int argc, Scheme_Object *argv[]);
  264. static Scheme_Object *byte_string_utf8_index (int argc, Scheme_Object *argv[]);
  265. static Scheme_Object *byte_string_utf8_ref (int argc, Scheme_Object *argv[]);
  266. static Scheme_Object *byte_string_utf8_length (int argc, Scheme_Object *argv[]);
  267. static Scheme_Object *byte_string_to_char_string (int argc, Scheme_Object *argv[]);
  268. static Scheme_Object *byte_string_to_char_string_locale (int argc, Scheme_Object *argv[]);
  269. static Scheme_Object *byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[]);
  270. static Scheme_Object *char_string_to_byte_string (int argc, Scheme_Object *argv[]);
  271. static Scheme_Object *char_string_to_byte_string_locale (int argc, Scheme_Object *argv[]);
  272. static Scheme_Object *char_string_to_byte_string_latin1 (int argc, Scheme_Object *argv[]);
  273. static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[]);
  274. static Scheme_Object *version(int argc, Scheme_Object *argv[]);
  275. static Scheme_Object *format(int argc, Scheme_Object *argv[]);
  276. static Scheme_Object *sch_printf(int argc, Scheme_Object *argv[]);
  277. static Scheme_Object *sch_eprintf(int argc, Scheme_Object *argv[]);
  278. static Scheme_Object *sch_fprintf(int argc, Scheme_Object *argv[]);
  279. static Scheme_Object *banner(int argc, Scheme_Object *argv[]);
  280. static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[]);
  281. static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[]);
  282. static Scheme_Object *system_type(int argc, Scheme_Object *argv[]);
  283. static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[]);
  284. static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[]);
  285. static Scheme_Object *current_locale(int argc, Scheme_Object *argv[]);
  286. static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[]);
  287. static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[]);
  288. static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object *argv[]);
  289. static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object *argv[]);
  290. static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[]);
  291. static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[]);
  292. static Scheme_Object *byte_converter_p(int argc, Scheme_Object *argv[]);
  293. #ifdef MZ_PRECISE_GC
  294. static void register_traversers(void);
  295. #endif
  296. static int mz_char_strcmp(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
  297. static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
  298. static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2);
  299. XFORM_NONGCING static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end,
  300. unsigned int *us, intptr_t dstart, intptr_t dend,
  301. intptr_t *ipos, intptr_t *jpos,
  302. char compact, char utf16,
  303. int *state, int might_continue, int permissive);
  304. XFORM_NONGCING static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end,
  305. unsigned char *s, intptr_t dstart, intptr_t dend,
  306. intptr_t *_ipos, intptr_t *_opos, char utf16);
  307. static char *string_to_from_locale(int to_bytes,
  308. char *in, intptr_t delta, intptr_t len,
  309. intptr_t *olen, int perm,
  310. int *no_cvt);
  311. #define portable_isspace(x) (((x) < 128) && isspace(x))
  312. ROSYM static Scheme_Object *sys_symbol;
  313. ROSYM static Scheme_Object *platform_3m_path, *platform_cgc_path;
  314. READ_ONLY static Scheme_Object *zero_length_char_string;
  315. READ_ONLY static Scheme_Object *zero_length_byte_string;
  316. SHARED_OK static Scheme_Hash_Table *putenv_str_table;
  317. SHARED_OK static char *embedding_banner;
  318. SHARED_OK static Scheme_Object *vers_str;
  319. SHARED_OK static Scheme_Object *banner_str;
  320. READ_ONLY static Scheme_Object *complete_symbol, *continues_symbol, *aborts_symbol, *error_symbol;
  321. void
  322. scheme_init_string (Scheme_Env *env)
  323. {
  324. Scheme_Object *p;
  325. REGISTER_SO(sys_symbol);
  326. sys_symbol = scheme_intern_symbol(SYSTEM_TYPE_NAME);
  327. REGISTER_SO(zero_length_char_string);
  328. REGISTER_SO(zero_length_byte_string);
  329. zero_length_char_string = scheme_alloc_char_string(0, 0);
  330. zero_length_byte_string = scheme_alloc_byte_string(0, 0);
  331. REGISTER_SO(complete_symbol);
  332. REGISTER_SO(continues_symbol);
  333. REGISTER_SO(aborts_symbol);
  334. REGISTER_SO(error_symbol);
  335. complete_symbol = scheme_intern_symbol("complete");
  336. continues_symbol = scheme_intern_symbol("continues");
  337. aborts_symbol = scheme_intern_symbol("aborts");
  338. error_symbol = scheme_intern_symbol("error");
  339. REGISTER_SO(platform_3m_path);
  340. #ifdef UNIX_FILE_SYSTEM
  341. # define MZ3M_SUBDIR "/3m"
  342. #else
  343. # ifdef DOS_FILE_SYSTEM
  344. # define MZ3M_SUBDIR "\\3m"
  345. # else
  346. # define MZ3M_SUBDIR ":3m"
  347. # endif
  348. #endif
  349. REGISTER_SO(platform_3m_path);
  350. REGISTER_SO(platform_cgc_path);
  351. platform_cgc_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH);
  352. platform_3m_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH MZ3M_SUBDIR);
  353. REGISTER_SO(putenv_str_table);
  354. REGISTER_SO(embedding_banner);
  355. REGISTER_SO(vers_str);
  356. REGISTER_SO(banner_str);
  357. vers_str = scheme_make_utf8_string(scheme_version());
  358. SCHEME_SET_CHAR_STRING_IMMUTABLE(vers_str);
  359. banner_str = scheme_make_utf8_string(scheme_banner());
  360. SCHEME_SET_CHAR_STRING_IMMUTABLE(banner_str);
  361. p = scheme_make_folding_prim(string_p, "string?", 1, 1, 1);
  362. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_UNARY_INLINED;
  363. scheme_add_global_constant("string?", p, env);
  364. scheme_add_global_constant("make-string",
  365. scheme_make_immed_prim(make_string,
  366. "make-string",
  367. 1, 2),
  368. env);
  369. scheme_add_global_constant("string",
  370. scheme_make_immed_prim(string,
  371. "string",
  372. 0, -1),
  373. env);
  374. scheme_add_global_constant("string-length",
  375. scheme_make_folding_prim(string_length,
  376. "string-length",
  377. 1, 1, 1),
  378. env);
  379. p = scheme_make_immed_prim(scheme_checked_string_ref, "string-ref", 2, 2);
  380. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED;
  381. scheme_add_global_constant("string-ref", p, env);
  382. p = scheme_make_immed_prim(scheme_checked_string_set, "string-set!", 3, 3);
  383. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
  384. scheme_add_global_constant("string-set!", p, env);
  385. scheme_add_global_constant("string=?",
  386. scheme_make_immed_prim(string_eq,
  387. "string=?",
  388. 2, -1),
  389. env);
  390. scheme_add_global_constant("string-locale=?",
  391. scheme_make_immed_prim(string_locale_eq,
  392. "string-locale=?",
  393. 2, -1),
  394. env);
  395. scheme_add_global_constant("string-ci=?",
  396. scheme_make_immed_prim(string_ci_eq,
  397. "string-ci=?",
  398. 2, -1),
  399. env);
  400. scheme_add_global_constant("string-locale-ci=?",
  401. scheme_make_immed_prim(string_locale_ci_eq,
  402. "string-locale-ci=?",
  403. 2, -1),
  404. env);
  405. scheme_add_global_constant("string<?",
  406. scheme_make_immed_prim(string_lt,
  407. "string<?",
  408. 2, -1),
  409. env);
  410. scheme_add_global_constant("string-locale<?",
  411. scheme_make_immed_prim(string_locale_lt,
  412. "string-locale<?",
  413. 2, -1),
  414. env);
  415. scheme_add_global_constant("string>?",
  416. scheme_make_immed_prim(string_gt,
  417. "string>?",
  418. 2, -1),
  419. env);
  420. scheme_add_global_constant("string-locale>?",
  421. scheme_make_immed_prim(string_locale_gt,
  422. "string-locale>?",
  423. 2, -1),
  424. env);
  425. scheme_add_global_constant("string<=?",
  426. scheme_make_immed_prim(string_lt_eq,
  427. "string<=?",
  428. 2, -1),
  429. env);
  430. scheme_add_global_constant("string>=?",
  431. scheme_make_immed_prim(string_gt_eq,
  432. "string>=?",
  433. 2, -1),
  434. env);
  435. scheme_add_global_constant("string-ci<?",
  436. scheme_make_immed_prim(string_ci_lt,
  437. "string-ci<?",
  438. 2, -1),
  439. env);
  440. scheme_add_global_constant("string-locale-ci<?",
  441. scheme_make_immed_prim(string_locale_ci_lt,
  442. "string-locale-ci<?",
  443. 2, -1),
  444. env);
  445. scheme_add_global_constant("string-ci>?",
  446. scheme_make_immed_prim(string_ci_gt,
  447. "string-ci>?",
  448. 2, -1),
  449. env);
  450. scheme_add_global_constant("string-locale-ci>?",
  451. scheme_make_immed_prim(string_locale_ci_gt,
  452. "string-locale-ci>?",
  453. 2, -1),
  454. env);
  455. scheme_add_global_constant("string-ci<=?",
  456. scheme_make_immed_prim(string_ci_lt_eq,
  457. "string-ci<=?",
  458. 2, -1),
  459. env);
  460. scheme_add_global_constant("string-ci>=?",
  461. scheme_make_immed_prim(string_ci_gt_eq,
  462. "string-ci>=?",
  463. 2, -1),
  464. env);
  465. scheme_add_global_constant("substring",
  466. scheme_make_immed_prim(substring,
  467. "substring",
  468. 2, 3),
  469. env);
  470. scheme_add_global_constant("string-append",
  471. scheme_make_immed_prim(string_append,
  472. "string-append",
  473. 0, -1),
  474. env);
  475. scheme_add_global_constant("string->list",
  476. scheme_make_immed_prim(string_to_list,
  477. "string->list",
  478. 1, 1),
  479. env);
  480. scheme_add_global_constant("list->string",
  481. scheme_make_immed_prim(list_to_string,
  482. "list->string",
  483. 1, 1),
  484. env);
  485. scheme_add_global_constant("string-copy",
  486. scheme_make_immed_prim(string_copy,
  487. "string-copy",
  488. 1, 1),
  489. env);
  490. scheme_add_global_constant("string-copy!",
  491. scheme_make_immed_prim(string_copy_bang,
  492. "string-copy!",
  493. 3, 5),
  494. env);
  495. scheme_add_global_constant("string-fill!",
  496. scheme_make_immed_prim(string_fill,
  497. "string-fill!",
  498. 2, 2),
  499. env);
  500. scheme_add_global_constant("string->immutable-string",
  501. scheme_make_immed_prim(string_to_immutable,
  502. "string->immutable-string",
  503. 1, 1),
  504. env);
  505. scheme_add_global_constant("string-normalize-nfc",
  506. scheme_make_immed_prim(string_normalize_c,
  507. "string-normalize-nfc",
  508. 1, 1),
  509. env);
  510. scheme_add_global_constant("string-normalize-nfkc",
  511. scheme_make_immed_prim(string_normalize_kc,
  512. "string-normalize-nfkc",
  513. 1, 1),
  514. env);
  515. scheme_add_global_constant("string-normalize-nfd",
  516. scheme_make_immed_prim(string_normalize_d,
  517. "string-normalize-nfd",
  518. 1, 1),
  519. env);
  520. scheme_add_global_constant("string-normalize-nfkd",
  521. scheme_make_immed_prim(string_normalize_kd,
  522. "string-normalize-nfkd",
  523. 1, 1),
  524. env);
  525. scheme_add_global_constant("string-upcase",
  526. scheme_make_immed_prim(string_upcase,
  527. "string-upcase",
  528. 1, 1),
  529. env);
  530. scheme_add_global_constant("string-downcase",
  531. scheme_make_immed_prim(string_downcase,
  532. "string-downcase",
  533. 1, 1),
  534. env);
  535. scheme_add_global_constant("string-titlecase",
  536. scheme_make_immed_prim(string_titlecase,
  537. "string-titlecase",
  538. 1, 1),
  539. env);
  540. scheme_add_global_constant("string-foldcase",
  541. scheme_make_immed_prim(string_foldcase,
  542. "string-foldcase",
  543. 1, 1),
  544. env);
  545. scheme_add_global_constant("string-locale-upcase",
  546. scheme_make_immed_prim(string_locale_upcase,
  547. "string-locale-upcase",
  548. 1, 1),
  549. env);
  550. scheme_add_global_constant("string-locale-downcase",
  551. scheme_make_immed_prim(string_locale_downcase,
  552. "string-locale-downcase",
  553. 1, 1),
  554. env);
  555. scheme_add_global_constant("current-locale",
  556. scheme_register_parameter(current_locale,
  557. "current-locale",
  558. MZCONFIG_LOCALE),
  559. env);
  560. scheme_add_global_constant("locale-string-encoding",
  561. scheme_make_immed_prim(locale_string_encoding,
  562. "locale-string-encoding",
  563. 0, 0),
  564. env);
  565. scheme_add_global_constant("system-language+country",
  566. scheme_make_immed_prim(system_language_country,
  567. "system-language+country",
  568. 0, 0),
  569. env);
  570. scheme_add_global_constant("bytes-converter?",
  571. scheme_make_immed_prim(byte_converter_p,
  572. "bytes-converter?",
  573. 1, 1),
  574. env);
  575. scheme_add_global_constant("bytes-convert",
  576. scheme_make_prim_w_arity2(byte_string_convert,
  577. "bytes-convert",
  578. 1, 7,
  579. 3, 3),
  580. env);
  581. scheme_add_global_constant("bytes-convert-end",
  582. scheme_make_prim_w_arity2(byte_string_convert_end,
  583. "bytes-convert-end",
  584. 0, 3,
  585. 2, 2),
  586. env);
  587. scheme_add_global_constant("bytes-open-converter",
  588. scheme_make_immed_prim(byte_string_open_converter,
  589. "bytes-open-converter",
  590. 2, 2),
  591. env);
  592. scheme_add_global_constant("bytes-close-converter",
  593. scheme_make_immed_prim(byte_string_close_converter,
  594. "bytes-close-converter",
  595. 1, 1),
  596. env);
  597. scheme_add_global_constant("format",
  598. scheme_make_noncm_prim(format,
  599. "format",
  600. 1, -1),
  601. env);
  602. scheme_add_global_constant("printf",
  603. scheme_make_noncm_prim(sch_printf,
  604. "printf",
  605. 1, -1),
  606. env);
  607. scheme_add_global_constant("eprintf",
  608. scheme_make_noncm_prim(sch_eprintf,
  609. "eprintf",
  610. 1, -1),
  611. env);
  612. scheme_add_global_constant("fprintf",
  613. scheme_make_noncm_prim(sch_fprintf,
  614. "fprintf",
  615. 2, -1),
  616. env);
  617. scheme_add_global_constant("byte?",
  618. scheme_make_folding_prim(byte_p,
  619. "byte?",
  620. 1, 1, 1),
  621. env);
  622. p = scheme_make_folding_prim(byte_string_p, "bytes?", 1, 1, 1);
  623. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
  624. scheme_add_global_constant("bytes?", p, env);
  625. scheme_add_global_constant("make-bytes",
  626. scheme_make_immed_prim(make_byte_string,
  627. "make-bytes",
  628. 1, 2),
  629. env);
  630. scheme_add_global_constant("bytes",
  631. scheme_make_immed_prim(byte_string,
  632. "bytes",
  633. 0, -1),
  634. env);
  635. GLOBAL_PRIM_W_ARITY("make-shared-bytes", make_shared_byte_string, 1, 2, env);
  636. GLOBAL_PRIM_W_ARITY("shared-bytes", shared_byte_string, 0, -1, env);
  637. scheme_add_global_constant("bytes-length",
  638. scheme_make_folding_prim(byte_string_length,
  639. "bytes-length",
  640. 1, 1, 1),
  641. env);
  642. p = scheme_make_immed_prim(scheme_checked_byte_string_ref, "bytes-ref", 2, 2);
  643. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED;
  644. scheme_add_global_constant("bytes-ref", p, env);
  645. p = scheme_make_immed_prim(scheme_checked_byte_string_set, "bytes-set!", 3, 3);
  646. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
  647. scheme_add_global_constant("bytes-set!", p, env);
  648. scheme_add_global_constant("bytes=?",
  649. scheme_make_immed_prim(byte_string_eq,
  650. "bytes=?",
  651. 2, -1),
  652. env);
  653. scheme_add_global_constant("bytes<?",
  654. scheme_make_immed_prim(byte_string_lt,
  655. "bytes<?",
  656. 2, -1),
  657. env);
  658. scheme_add_global_constant("bytes>?",
  659. scheme_make_immed_prim(byte_string_gt,
  660. "bytes>?",
  661. 2, -1),
  662. env);
  663. scheme_add_global_constant("subbytes",
  664. scheme_make_immed_prim(byte_substring,
  665. "subbytes",
  666. 2, 3),
  667. env);
  668. scheme_add_global_constant("bytes-append",
  669. scheme_make_immed_prim(byte_string_append,
  670. "bytes-append",
  671. 0, -1),
  672. env);
  673. scheme_add_global_constant("bytes->list",
  674. scheme_make_immed_prim(byte_string_to_list,
  675. "bytes->list",
  676. 1, 1),
  677. env);
  678. scheme_add_global_constant("list->bytes",
  679. scheme_make_immed_prim(list_to_byte_string,
  680. "list->bytes",
  681. 1, 1),
  682. env);
  683. scheme_add_global_constant("bytes-copy",
  684. scheme_make_immed_prim(byte_string_copy,
  685. "bytes-copy",
  686. 1, 1),
  687. env);
  688. scheme_add_global_constant("bytes-copy!",
  689. scheme_make_immed_prim(byte_string_copy_bang,
  690. "bytes-copy!",
  691. 3, 5),
  692. env);
  693. scheme_add_global_constant("bytes-fill!",
  694. scheme_make_immed_prim(byte_string_fill,
  695. "bytes-fill!",
  696. 2, 2),
  697. env);
  698. scheme_add_global_constant("bytes->immutable-bytes",
  699. scheme_make_immed_prim(byte_string_to_immutable,
  700. "bytes->immutable-bytes",
  701. 1, 1),
  702. env);
  703. scheme_add_global_constant("bytes-utf-8-index",
  704. scheme_make_immed_prim(byte_string_utf8_index,
  705. "bytes-utf-8-index",
  706. 2, 4),
  707. env);
  708. scheme_add_global_constant("bytes-utf-8-length",
  709. scheme_make_immed_prim(byte_string_utf8_length,
  710. "bytes-utf-8-length",
  711. 1, 4),
  712. env);
  713. scheme_add_global_constant("bytes-utf-8-ref",
  714. scheme_make_immed_prim(byte_string_utf8_ref,
  715. "bytes-utf-8-ref",
  716. 2, 4),
  717. env);
  718. scheme_add_global_constant("bytes->string/utf-8",
  719. scheme_make_immed_prim(byte_string_to_char_string,
  720. "bytes->string/utf-8",
  721. 1, 4),
  722. env);
  723. scheme_add_global_constant("bytes->string/locale",
  724. scheme_make_immed_prim(byte_string_to_char_string_locale,
  725. "bytes->string/locale",
  726. 1, 4),
  727. env);
  728. scheme_add_global_constant("bytes->string/latin-1",
  729. scheme_make_immed_prim(byte_string_to_char_string_latin1,
  730. "bytes->string/latin-1",
  731. 1, 4),
  732. env);
  733. scheme_add_global_constant("string->bytes/utf-8",
  734. scheme_make_immed_prim(char_string_to_byte_string,
  735. "string->bytes/utf-8",
  736. 1, 4),
  737. env);
  738. scheme_add_global_constant("string->bytes/locale",
  739. scheme_make_immed_prim(char_string_to_byte_string_locale,
  740. "string->bytes/locale",
  741. 1, 4),
  742. env);
  743. scheme_add_global_constant("string->bytes/latin-1",
  744. scheme_make_immed_prim(char_string_to_byte_string_latin1,
  745. "string->bytes/latin-1",
  746. 1, 4),
  747. env);
  748. scheme_add_global_constant("string-utf-8-length",
  749. scheme_make_immed_prim(char_string_utf8_length,
  750. "string-utf-8-length",
  751. 1, 3),
  752. env);
  753. /* In principle, `version' could be foldable, but it invites
  754. more problems than it solves... */
  755. scheme_add_global_constant("version",
  756. scheme_make_immed_prim(version,
  757. "version",
  758. 0, 0),
  759. env);
  760. scheme_add_global_constant("banner",
  761. scheme_make_immed_prim(banner,
  762. "banner",
  763. 0, 0),
  764. env);
  765. scheme_add_global_constant("getenv",
  766. scheme_make_immed_prim(sch_getenv,
  767. "getenv",
  768. 1, 1),
  769. env);
  770. scheme_add_global_constant("putenv",
  771. scheme_make_immed_prim(sch_putenv,
  772. "putenv",
  773. 2, 2),
  774. env);
  775. /* Don't make these folding, since they're platform-specific: */
  776. scheme_add_global_constant("system-type",
  777. scheme_make_immed_prim(system_type,
  778. "system-type",
  779. 0, 1),
  780. env);
  781. scheme_add_global_constant("system-library-subpath",
  782. scheme_make_immed_prim(system_library_subpath,
  783. "system-library-subpath",
  784. 0, 1),
  785. env);
  786. scheme_add_global_constant("current-command-line-arguments",
  787. scheme_register_parameter(cmdline_args,
  788. "current-command-line-arguments",
  789. MZCONFIG_CMDLINE_ARGS),
  790. env);
  791. #ifdef MZ_PRECISE_GC
  792. register_traversers();
  793. #endif
  794. }
  795. void scheme_init_string_places(void) {
  796. REGISTER_SO(current_locale_name_ptr);
  797. current_locale_name_ptr = "xxxx\0\0\0\0";
  798. }
  799. /**********************************************************************/
  800. /* UTF-8 char constructors */
  801. /**********************************************************************/
  802. Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, intptr_t d, intptr_t len)
  803. {
  804. intptr_t ulen;
  805. mzchar *us;
  806. if (len) {
  807. ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
  808. NULL, 0, -1,
  809. NULL, 0 /* not UTF-16 */, 0xFFFD);
  810. us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
  811. scheme_utf8_decode((unsigned char *)chars, d, d + len,
  812. us, 0, -1,
  813. NULL, 0 /* not UTF-16 */, 0xFFFD);
  814. us[ulen] = 0;
  815. } else {
  816. us = (mzchar *)"\0\0\0";
  817. ulen = 0;
  818. }
  819. return scheme_make_sized_offset_char_string(us, 0, ulen, 0);
  820. }
  821. Scheme_Object *
  822. scheme_make_sized_utf8_string(char *chars, intptr_t len)
  823. {
  824. return scheme_make_sized_offset_utf8_string(chars, 0, len);
  825. }
  826. Scheme_Object *
  827. scheme_make_immutable_sized_utf8_string(char *chars, intptr_t len)
  828. {
  829. Scheme_Object *s;
  830. s = scheme_make_sized_offset_utf8_string(chars, 0, len);
  831. if (len)
  832. SCHEME_SET_CHAR_STRING_IMMUTABLE(s);
  833. return s;
  834. }
  835. Scheme_Object *
  836. scheme_make_utf8_string(const char *chars)
  837. {
  838. return scheme_make_sized_offset_utf8_string((char *)chars, 0, -1);
  839. }
  840. Scheme_Object *
  841. scheme_make_locale_string(const char *chars)
  842. {
  843. return scheme_byte_string_to_char_string_locale(scheme_make_byte_string((char *)chars));
  844. }
  845. /**********************************************************************/
  846. /* index helpers */
  847. /**********************************************************************/
  848. void scheme_out_of_string_range(const char *name, const char *which,
  849. Scheme_Object *i, Scheme_Object *s,
  850. intptr_t start, intptr_t len)
  851. {
  852. int is_byte;
  853. is_byte = SCHEME_BYTE_STRINGP(s);
  854. if (len) {
  855. char *sstr;
  856. intptr_t slen;
  857. sstr = scheme_make_provided_string(s, 2, &slen);
  858. scheme_raise_exn(MZEXN_FAIL_CONTRACT,
  859. "%s: %sindex %s out of range [%d, %d] for %s%s: %t",
  860. name, which,
  861. scheme_make_provided_string(i, 2, NULL),
  862. ((start < 0) ? 0 : start),
  863. ((start < 0) ? (len - 1) : len),
  864. is_byte ? "byte-" : "",
  865. SCHEME_CHAPERONE_VECTORP(s) ? "vector" : "string",
  866. sstr, slen);
  867. } else {
  868. scheme_raise_exn(MZEXN_FAIL_CONTRACT,
  869. "%s: %sindex %s out of range for empty %s%s",
  870. name, which,
  871. scheme_make_provided_string(i, 0, NULL),
  872. is_byte ? "byte-" : "",
  873. SCHEME_CHAPERONE_VECTORP(s) ? "vector" : "string");
  874. }
  875. }
  876. intptr_t scheme_extract_index(const char *name, int pos, int argc, Scheme_Object **argv, intptr_t top, int false_ok)
  877. {
  878. intptr_t i;
  879. int is_top = 0;
  880. if (SCHEME_INTP(argv[pos])) {
  881. i = SCHEME_INT_VAL(argv[pos]);
  882. } else if (SCHEME_BIGNUMP(argv[pos])) {
  883. if (SCHEME_BIGPOS(argv[pos])) {
  884. i = top; /* out-of-bounds */
  885. is_top = 1;
  886. } else
  887. i = -1; /* negative */
  888. } else
  889. i = -1;
  890. if (!is_top && (i < 0))
  891. scheme_wrong_type(name,
  892. (false_ok ? "non-negative exact integer or #f" : "non-negative exact integer"),
  893. pos, argc, argv);
  894. return i;
  895. }
  896. void scheme_get_substring_indices(const char *name, Scheme_Object *str,
  897. int argc, Scheme_Object **argv,
  898. int spos, int fpos, intptr_t *_start, intptr_t *_finish)
  899. {
  900. intptr_t len;
  901. intptr_t start, finish;
  902. if (SCHEME_CHAPERONE_VECTORP(str))
  903. len = SCHEME_VEC_SIZE(str);
  904. else if (SCHEME_CHAR_STRINGP(str))
  905. len = SCHEME_CHAR_STRTAG_VAL(str);
  906. else
  907. len = SCHEME_BYTE_STRTAG_VAL(str);
  908. if (argc > spos)
  909. start = scheme_extract_index(name, spos, argc, argv, len + 1, 0);
  910. else
  911. start = 0;
  912. if (argc > fpos)
  913. finish = scheme_extract_index(name, fpos, argc, argv, len + 1, 0);
  914. else
  915. finish = len;
  916. if (!(start <= len)) {
  917. scheme_out_of_string_range(name, (fpos < 100) ? "starting " : "", argv[spos], str, 0, len);
  918. }
  919. if (!(finish >= start && finish <= len)) {
  920. scheme_out_of_string_range(name, "ending ", argv[fpos], str, start, len);
  921. }
  922. *_start = start;
  923. *_finish = finish;
  924. }
  925. void scheme_do_get_substring_indices(const char *name, Scheme_Object *str,
  926. int argc, Scheme_Object **argv,
  927. int spos, int fpos, intptr_t *_start, intptr_t *_finish, intptr_t len)
  928. {
  929. if (argc > spos) {
  930. if (SCHEME_INTP(argv[spos])) {
  931. intptr_t start = SCHEME_INT_VAL(argv[spos]);
  932. if ((start >= 0) && (start < len)) {
  933. *_start = start;
  934. if (argc > fpos) {
  935. intptr_t finish = SCHEME_INT_VAL(argv[fpos]);
  936. if ((finish >= start) && (finish <= len)) {
  937. *_finish = finish;
  938. return;
  939. }
  940. } else {
  941. *_finish = len;
  942. return;
  943. }
  944. }
  945. }
  946. } else {
  947. *_start = 0;
  948. *_finish = len;
  949. return;
  950. }
  951. scheme_get_substring_indices(name, str, argc, argv, spos, fpos, _start, _finish);
  952. }
  953. /**********************************************************************/
  954. /* char strings */
  955. /**********************************************************************/
  956. #define SCHEME_X_STR_VAL(x) SCHEME_CHAR_STR_VAL(x)
  957. #define SCHEME_X_STRTAG_VAL(x) SCHEME_CHAR_STRTAG_VAL(x)
  958. #define SCHEME_X_STRINGP(x) SCHEME_CHAR_STRINGP(x)
  959. #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_CHAR_STRINGP(x)
  960. #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_CHAR_STRING_IMMUTABLE(x)
  961. #define scheme_x_string_type scheme_char_string_type
  962. #define X(a, b) a##_char##b
  963. #define X_(a, b) a##_##b
  964. #define X__(a) a
  965. #define EMPTY (mzchar *)"\0\0\0"
  966. #define Xchar mzchar
  967. #define uXchar mzchar
  968. #define XSTR ""
  969. #define XSTRINGSTR "string"
  970. #define SUBXSTR "substring"
  971. #define CHARP(x) SCHEME_CHARP(x)
  972. #define CHAR_VAL(x) SCHEME_CHAR_VAL(x)
  973. #define CHAR_STR "character"
  974. #define MAKE_CHAR(x) _scheme_make_char(x)
  975. #define xstrlen scheme_char_strlen
  976. #include "strops.inc"
  977. #define GEN_STRING_COMP(name, scheme_name, comp, op, ul, size_shortcut) \
  978. static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
  979. { mzchar *s, *prev; int i, sl, pl; int falz = 0;\
  980. if (!SCHEME_CHAR_STRINGP(argv[0])) \
  981. scheme_wrong_type(scheme_name, "string", 0, argc, argv); \
  982. prev = SCHEME_CHAR_STR_VAL(argv[0]); pl = SCHEME_CHAR_STRTAG_VAL(argv[0]); \
  983. for (i = 1; i < argc; i++) { \
  984. if (!SCHEME_CHAR_STRINGP(argv[i])) \
  985. scheme_wrong_type(scheme_name, "string", i, argc, argv); \
  986. s = SCHEME_CHAR_STR_VAL(argv[i]); sl = SCHEME_CHAR_STRTAG_VAL(argv[i]); \
  987. if (!falz) if (!(comp(scheme_name, \
  988. prev, pl, \
  989. s, sl, ul, size_shortcut) op 0)) falz = 1; \
  990. prev = s; pl = sl; \
  991. } \
  992. return falz ? scheme_false : scheme_true; \
  993. }
  994. GEN_STRING_COMP(string_eq, "string=?", mz_char_strcmp, ==, 0, 1)
  995. GEN_STRING_COMP(string_lt, "string<?", mz_char_strcmp, <, 0, 0)
  996. GEN_STRING_COMP(string_gt, "string>?", mz_char_strcmp, >, 0, 0)
  997. GEN_STRING_COMP(string_lt_eq, "string<=?", mz_char_strcmp, <=, 0, 0)
  998. GEN_STRING_COMP(string_gt_eq, "string>=?", mz_char_strcmp, >=, 0, 0)
  999. GEN_STRING_COMP(string_ci_eq, "string-ci=?", mz_char_strcmp_ci, ==, 0, 0)
  1000. GEN_STRING_COMP(string_ci_lt, "string-ci<?", mz_char_strcmp_ci, <, 0, 0)
  1001. GEN_STRING_COMP(string_ci_gt, "string-ci>?", mz_char_strcmp_ci, >, 0, 0)
  1002. GEN_STRING_COMP(string_ci_lt_eq, "string-ci<=?", mz_char_strcmp_ci, <=, 0, 0)
  1003. GEN_STRING_COMP(string_ci_gt_eq, "string-ci>=?", mz_char_strcmp_ci, >=, 0, 0)
  1004. GEN_STRING_COMP(string_locale_eq, "string-locale=?", mz_char_strcmp, ==, 1, 0)
  1005. GEN_STRING_COMP(string_locale_lt, "string-locale<?", mz_char_strcmp, <, 1, 0)
  1006. GEN_STRING_COMP(string_locale_gt, "string-locale>?", mz_char_strcmp, >, 1, 0)
  1007. GEN_STRING_COMP(string_locale_ci_eq, "string-locale-ci=?", mz_char_strcmp_ci, ==, 1, 0)
  1008. GEN_STRING_COMP(string_locale_ci_lt, "string-locale-ci<?", mz_char_strcmp_ci, <, 1, 0)
  1009. GEN_STRING_COMP(string_locale_ci_gt, "string-locale-ci>?", mz_char_strcmp_ci, >, 1, 0)
  1010. /**********************************************************************/
  1011. /* byte strings */
  1012. /**********************************************************************/
  1013. #define SCHEME_BYTEP(x) ((SCHEME_INTP(x)) && (SCHEME_INT_VAL(x) >= 0) && (SCHEME_INT_VAL(x) <= 255))
  1014. #define BYTE_STR "exact integer in [0,255]"
  1015. static Scheme_Object *
  1016. byte_p(int argc, Scheme_Object *argv[])
  1017. {
  1018. return (SCHEME_BYTEP(argv[0]) ? scheme_true : scheme_false);
  1019. }
  1020. #define SCHEME_X_STR_VAL(x) SCHEME_BYTE_STR_VAL(x)
  1021. #define SCHEME_X_STRTAG_VAL(x) SCHEME_BYTE_STRTAG_VAL(x)
  1022. #define SCHEME_X_STRINGP(x) SCHEME_BYTE_STRINGP(x)
  1023. #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_BYTE_STRINGP(x)
  1024. #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_BYTE_STRING_IMMUTABLE(x)
  1025. #define scheme_x_string_type scheme_byte_string_type
  1026. #define X(a, b) a##_byte##b
  1027. #define X_(a, b) a##_byte_##b
  1028. #define X__(a) byte_##a
  1029. #define EMPTY ""
  1030. #define Xchar char
  1031. #define uXchar unsigned char
  1032. #define XSTR "byte "
  1033. #define XSTRINGSTR "bytes"
  1034. #define SUBXSTR "subbytes"
  1035. #define CHARP(x) SCHEME_BYTEP(x)
  1036. #define CHAR_VAL(x) SCHEME_INT_VAL(x)
  1037. #define CHAR_STR BYTE_STR
  1038. #define MAKE_CHAR(x) scheme_make_integer_value(x)
  1039. #define xstrlen strlen
  1040. #define GENERATING_BYTE
  1041. #include "strops.inc"
  1042. #undef GENERATING_BYTE
  1043. /* comparisons */
  1044. #define GEN_BYTE_STRING_COMP(name, scheme_name, comp, op) \
  1045. static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
  1046. { char *s, *prev; int i, sl, pl; int falz = 0;\
  1047. if (!SCHEME_BYTE_STRINGP(argv[0])) \
  1048. scheme_wrong_type(scheme_name, "byte string", 0, argc, argv); \
  1049. prev = SCHEME_BYTE_STR_VAL(argv[0]); pl = SCHEME_BYTE_STRTAG_VAL(argv[0]); \
  1050. for (i = 1; i < argc; i++) { \
  1051. if (!SCHEME_BYTE_STRINGP(argv[i])) \
  1052. scheme_wrong_type(scheme_name, "byte string", i, argc, argv); \
  1053. s = SCHEME_BYTE_STR_VAL(argv[i]); sl = SCHEME_BYTE_STRTAG_VAL(argv[i]); \
  1054. if (!falz) if (!(comp(scheme_name, \
  1055. (unsigned char *)prev, pl, \
  1056. (unsigned char *)s, sl) op 0)) falz = 1; \
  1057. prev = s; pl = sl; \
  1058. } \
  1059. return falz ? scheme_false : scheme_true; \
  1060. }
  1061. GEN_BYTE_STRING_COMP(byte_string_eq, "bytes=?", mz_strcmp, ==)
  1062. GEN_BYTE_STRING_COMP(byte_string_lt, "bytes<?", mz_strcmp, <)
  1063. GEN_BYTE_STRING_COMP(byte_string_gt, "bytes>?", mz_strcmp, >)
  1064. /**********************************************************************/
  1065. /* byte string <-> char string */
  1066. /**********************************************************************/
  1067. /************************* bytes->string *************************/
  1068. static Scheme_Object *
  1069. do_byte_string_to_char_string(const char *who,
  1070. Scheme_Object *bstr,
  1071. intptr_t istart, intptr_t ifinish,
  1072. int perm, int as_locale)
  1073. {
  1074. int i, ulen;
  1075. char *chars;
  1076. unsigned int *v;
  1077. chars = SCHEME_BYTE_STR_VAL(bstr);
  1078. ulen = utf8_decode_x((unsigned char *)chars, istart, ifinish,
  1079. NULL, 0, -1,
  1080. NULL, NULL, 0, 0,
  1081. NULL, 0,
  1082. (perm > -1) ? 0xD800 : 0);
  1083. if (ulen < 0) {
  1084. scheme_arg_mismatch(who,
  1085. STRING_IS_NOT_UTF_8,
  1086. bstr);
  1087. }
  1088. v = (unsigned int *)scheme_malloc_atomic((ulen + 1) * sizeof(unsigned int));
  1089. utf8_decode_x((unsigned char *)chars, istart, ifinish,
  1090. v, 0, -1,
  1091. NULL, NULL, 0, 0,
  1092. NULL, 0,
  1093. (perm > -1) ? 0xD800 : 0);
  1094. if (perm > -1) {
  1095. for (i = 0; i < ulen; i++) {
  1096. if (v[i] == 0xD800)
  1097. v[i] = perm;
  1098. }
  1099. }
  1100. v[ulen] = 0;
  1101. return scheme_make_sized_char_string(v, ulen, 0);
  1102. }
  1103. static Scheme_Object *
  1104. do_byte_string_to_char_string_locale(const char *who,
  1105. Scheme_Object *bstr,
  1106. intptr_t istart, intptr_t ifinish,
  1107. int perm)
  1108. {
  1109. char *us;
  1110. intptr_t olen;
  1111. reset_locale();
  1112. if (!iconv_ready) init_iconv();
  1113. if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on || !mzCHK_PROC(iconv_open))
  1114. return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
  1115. if (istart < ifinish) {
  1116. int no_cvt;
  1117. us = string_to_from_locale(0, SCHEME_BYTE_STR_VAL(bstr),
  1118. istart, ifinish - istart,
  1119. &olen, perm, &no_cvt);
  1120. if (!us) {
  1121. if (no_cvt) {
  1122. return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
  1123. } else {
  1124. scheme_arg_mismatch(who,
  1125. "byte string is not a valid encoding for the current locale: ",
  1126. bstr);
  1127. }
  1128. }
  1129. ((mzchar *)us)[olen] = 0;
  1130. } else {
  1131. us = "\0\0\0";
  1132. olen = 0;
  1133. }
  1134. return scheme_make_sized_char_string((mzchar *)us, olen, 0);
  1135. }
  1136. static Scheme_Object *
  1137. do_string_to_vector(const char *who, int mode, int argc, Scheme_Object *argv[])
  1138. {
  1139. int permc;
  1140. intptr_t istart, ifinish;
  1141. if (!SCHEME_BYTE_STRINGP(argv[0]))
  1142. scheme_wrong_type(who, "byte string", 0, argc, argv);
  1143. if ((argc < 2) || SCHEME_FALSEP(argv[1]))
  1144. permc = -1;
  1145. else {
  1146. if (!SCHEME_CHARP(argv[1]))
  1147. scheme_wrong_type(who, "character or #f", 1, argc, argv);
  1148. permc = SCHEME_CHAR_VAL(argv[1]);
  1149. }
  1150. scheme_get_substring_indices(who, argv[0], argc, argv,
  1151. 2, 3,
  1152. &istart, &ifinish);
  1153. if (mode == 0)
  1154. return do_byte_string_to_char_string(who, argv[0], istart, ifinish, permc, 0);
  1155. else if (mode == 1)
  1156. return do_byte_string_to_char_string_locale(who, argv[0], istart, ifinish, permc);
  1157. else {
  1158. /* Latin-1 */
  1159. mzchar *us;
  1160. unsigned char *s;
  1161. intptr_t i, len;
  1162. len = ifinish - istart;
  1163. s = (unsigned char *)SCHEME_BYTE_STR_VAL(argv[0]);
  1164. us = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
  1165. for (i = istart; i < ifinish; i++) {
  1166. us[i - istart] = s[i];
  1167. }
  1168. us[len] = 0;
  1169. return scheme_make_sized_char_string(us, len, 0);
  1170. }
  1171. }
  1172. static Scheme_Object *
  1173. byte_string_to_char_string (int argc, Scheme_Object *argv[])
  1174. {
  1175. return do_string_to_vector("bytes->string/utf-8", 0, argc, argv);
  1176. }
  1177. static Scheme_Object *
  1178. byte_string_to_char_string_locale (int argc, Scheme_Object *argv[])
  1179. {
  1180. return do_string_to_vector("bytes->string/locale", 1, argc, argv);
  1181. }
  1182. static Scheme_Object *
  1183. byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
  1184. {
  1185. return do_string_to_vector("bytes->string/latin-1", 2, argc, argv);
  1186. }
  1187. Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
  1188. {
  1189. return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
  1190. }
  1191. Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
  1192. {
  1193. return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
  1194. }
  1195. /************************* string->bytes *************************/
  1196. static Scheme_Object *do_char_string_to_byte_string(Scheme_Object *s, intptr_t istart, intptr_t ifinish,
  1197. int as_locale)
  1198. {
  1199. char *bs;
  1200. int slen;
  1201. slen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
  1202. NULL, 0,
  1203. 0 /* UTF-16 */);
  1204. bs = (char *)scheme_malloc_atomic(slen + 1);
  1205. scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
  1206. (unsigned char *)bs, 0,
  1207. 0 /* UTF-16 */);
  1208. bs[slen] = 0;
  1209. return scheme_make_sized_byte_string(bs, slen, 0);
  1210. }
  1211. static Scheme_Object *
  1212. do_char_string_to_byte_string_locale(const char *who,
  1213. Scheme_Object *cstr,
  1214. intptr_t istart, intptr_t ifinish,
  1215. int perm)
  1216. {
  1217. char *s;
  1218. intptr_t olen;
  1219. reset_locale();
  1220. if (!iconv_ready) init_iconv();
  1221. if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on || !mzCHK_PROC(iconv_open))
  1222. return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
  1223. if (istart < ifinish) {
  1224. int no_cvt;
  1225. s = string_to_from_locale(1, (char *)SCHEME_CHAR_STR_VAL(cstr),
  1226. istart, ifinish - istart,
  1227. &olen, perm, &no_cvt);
  1228. if (!s) {
  1229. if (no_cvt) {
  1230. return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
  1231. } else {
  1232. scheme_arg_mismatch(who,
  1233. "string cannot be encoded for the current locale: ",
  1234. cstr);
  1235. }
  1236. }
  1237. s[olen] = 0;
  1238. } else {
  1239. s = "";
  1240. olen = 0;
  1241. }
  1242. return scheme_make_sized_byte_string(s, olen, 0);
  1243. }
  1244. Scheme_Object *scheme_char_string_to_byte_string(Scheme_Object *s)
  1245. {
  1246. return do_char_string_to_byte_string(s, 0, SCHEME_CHAR_STRLEN_VAL(s), 0);
  1247. }
  1248. Scheme_Object *scheme_char_string_to_byte_string_locale(Scheme_Object *s)
  1249. {
  1250. return do_char_string_to_byte_string_locale("s->s", s, 0, SCHEME_CHAR_STRLEN_VAL(s), '?');
  1251. }
  1252. static Scheme_Object *do_chars_to_bytes(const char *who, int mode,
  1253. int argc, Scheme_Object *argv[])
  1254. {
  1255. intptr_t istart, ifinish;
  1256. int permc;
  1257. if (!SCHEME_CHAR_STRINGP(argv[0]))
  1258. scheme_wrong_type(who, "string", 0, argc, argv);
  1259. if ((argc < 2) || SCHEME_FALSEP(argv[1]))
  1260. permc = -1;
  1261. else {
  1262. if (!SCHEME_BYTEP(argv[1]))
  1263. scheme_wrong_type(who, "byte or #f", 1, argc, argv);
  1264. permc = SCHEME_INT_VAL(argv[1]);
  1265. }
  1266. scheme_get_substring_indices(who, argv[0], argc, argv,
  1267. 2, 3, &istart, &ifinish);
  1268. if (mode == 1)
  1269. return do_char_string_to_byte_string_locale(who, argv[0], istart, ifinish, permc);
  1270. else if (mode == 0)
  1271. return do_char_string_to_byte_string(argv[0], istart, ifinish, 0);
  1272. else {
  1273. /* Latin-1 */
  1274. mzchar *us;
  1275. unsigned char *s;
  1276. intptr_t i, len;
  1277. len = ifinish - istart;
  1278. us = SCHEME_CHAR_STR_VAL(argv[0]);
  1279. s = (unsigned char *)scheme_malloc_atomic(len + 1);
  1280. for (i = istart; i < ifinish; i++) {
  1281. if (us[i] < 256)
  1282. s[i - istart] = us[i];
  1283. else if (permc >= 0) {
  1284. s[i - istart] = permc;
  1285. } else {
  1286. scheme_arg_mismatch(who,
  1287. "string cannot be encoded in Latin-1: ",
  1288. argv[0]);
  1289. }
  1290. }
  1291. s[len] = 0;
  1292. return scheme_make_sized_byte_string((char *)s, len, 0);
  1293. }
  1294. }
  1295. static Scheme_Object *char_string_to_byte_string(int argc, Scheme_Object *argv[])
  1296. {
  1297. return do_chars_to_bytes("string->bytes/utf-8", 0, argc, argv);
  1298. }
  1299. static Scheme_Object *char_string_to_byte_string_locale(int argc, Scheme_Object *argv[])
  1300. {
  1301. return do_chars_to_bytes("string->bytes/locale", 1, argc, argv);
  1302. }
  1303. static Scheme_Object *char_string_to_byte_string_latin1(int argc, Scheme_Object *argv[])
  1304. {
  1305. return do_chars_to_bytes("string->bytes/latin-1", 2, argc, argv);
  1306. }
  1307. /************************* Other *************************/
  1308. static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[])
  1309. {
  1310. intptr_t istart, ifinish, len;
  1311. if (!SCHEME_CHAR_STRINGP(argv[0]))
  1312. scheme_wrong_type("string-utf-8-length", "string", 0, argc, argv);
  1313. scheme_get_substring_indices("string-utf-8-length", argv[0], argc, argv,
  1314. 1, 2, &istart, &ifinish);
  1315. len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[0]), istart, ifinish,
  1316. NULL, 0, 0);
  1317. return scheme_make_integer(len);
  1318. }
  1319. static Scheme_Object *
  1320. byte_string_utf8_length (int argc, Scheme_Object *argv[])
  1321. {
  1322. int len, perm;
  1323. intptr_t istart, ifinish;
  1324. char *chars;
  1325. if (!SCHEME_BYTE_STRINGP(argv[0]))
  1326. scheme_wrong_type("bytes-utf-8-length", "string", 0, argc, argv);
  1327. chars = SCHEME_BYTE_STR_VAL(argv[0]);
  1328. if ((argc > 1) && !SCHEME_FALSEP(argv[1])) {
  1329. if (!SCHEME_CHARP(argv[1]))
  1330. scheme_wrong_type("bytes-utf-8-length", "character or #f", 1, argc, argv);
  1331. perm = 1;
  1332. } else
  1333. perm = 0;
  1334. scheme_get_substring_indices("bytes-utf-8-length", argv[0], argc, argv,
  1335. 2, 3,
  1336. &istart, &ifinish);
  1337. len = scheme_utf8_decode((unsigned char *)chars, istart, ifinish,
  1338. NULL, 0, -1,
  1339. NULL, 0, perm);
  1340. if (len < 0)
  1341. return scheme_false;
  1342. else
  1343. return scheme_make_integer(len);
  1344. }
  1345. static Scheme_Object *
  1346. byte_string_utf8_index(int argc, Scheme_Object *argv[])
  1347. {
  1348. intptr_t istart, ifinish, pos = -1, opos, ipos;
  1349. int result, perm;
  1350. char *chars;
  1351. if (!SCHEME_BYTE_STRINGP(argv[0]))
  1352. scheme_wrong_type("bytes-utf-8-index", "byte string", 0, argc, argv);
  1353. chars = SCHEME_BYTE_STR_VAL(argv[0]);
  1354. if (SCHEME_INTP(argv[1])) {
  1355. pos = SCHEME_INT_VAL(argv[1]);
  1356. } else if (SCHEME_BIGNUMP(argv[1])) {
  1357. if (SCHEME_BIGPOS(argv[1]))
  1358. pos = 0x7FFFFFFF;
  1359. }
  1360. if (pos < 0) {
  1361. scheme_wrong_type("bytes-utf-8-index", "non-negative exact integer", 1, argc, argv);
  1362. }
  1363. if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
  1364. if (!SCHEME_CHARP(argv[2]))
  1365. scheme_wrong_type("bytes-utf-8-index", "character or #f", 1, argc, argv);
  1366. perm = 1;
  1367. } else
  1368. perm = 0;
  1369. scheme_get_substring_indices("bytes-utf-8-index", argv[0], argc, argv,
  1370. 3, 4,
  1371. &istart, &ifinish);
  1372. result = utf8_decode_x((unsigned char *)chars, istart, ifinish,
  1373. NULL, 0, pos,
  1374. &ipos, &opos,
  1375. 0, 0, NULL, 0, perm ? 1 : 0);
  1376. if (((result < 0) && (result != -3))
  1377. || ((ipos == ifinish) && (opos <= pos)))
  1378. return scheme_false;
  1379. else
  1380. return scheme_make_integer(ipos);
  1381. }
  1382. static Scheme_Object *
  1383. byte_string_utf8_ref(int argc, Scheme_Object *argv[])
  1384. {
  1385. intptr_t istart, ifinish, pos = -1, opos, ipos;
  1386. char *chars;
  1387. unsigned int us[1];
  1388. Scheme_Object *perm;
  1389. if (!SCHEME_BYTE_STRINGP(argv[0]))
  1390. scheme_wrong_type("bytes-utf-8-ref", "byte string", 0, argc, argv);
  1391. chars = SCHEME_BYTE_STR_VAL(argv[0]);
  1392. if (SCHEME_INTP(argv[1])) {
  1393. pos = SCHEME_INT_VAL(argv[1]);
  1394. } else if (SCHEME_BIGNUMP(argv[1])) {
  1395. if (SCHEME_BIGPOS(argv[1]))
  1396. pos = 0x7FFFFFFF;
  1397. }
  1398. if (pos < 0) {
  1399. scheme_wrong_type("bytes-utf-8-ref", "non-negative exact integer", 1, argc, argv);
  1400. }
  1401. if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
  1402. if (!SCHEME_CHARP(argv[2]))
  1403. scheme_wrong_type("bytes-utf-8-ref", "character or #f", 1, argc, argv);
  1404. perm = argv[2];
  1405. } else
  1406. perm = 0;
  1407. scheme_get_substring_indices("bytes-utf-8-ref", argv[0], argc, argv,
  1408. 3, 4,
  1409. &istart, &ifinish);
  1410. if (pos > 0) {
  1411. utf8_decode_x((unsigned char *)chars, istart, ifinish,
  1412. NULL, 0, pos,
  1413. &ipos, &opos,
  1414. 0, 0, NULL, 0, perm ? 1 : 0);
  1415. if (opos < pos)
  1416. return scheme_false;
  1417. istart = ipos;
  1418. }
  1419. utf8_decode_x((unsigned char *)chars, istart, ifinish,
  1420. us, 0, 1,
  1421. &ipos, &opos,
  1422. 0, 0, NULL, 0, perm ? 0xFFFF : 0);
  1423. if (opos < 1)
  1424. return scheme_false;
  1425. else if (us[0] == 0xFFFF)
  1426. return perm;
  1427. else
  1428. return scheme_make_character(us[0]);
  1429. }
  1430. /********************************************************************/
  1431. /* format */
  1432. /********************************************************************/
  1433. void scheme_do_format(const char *procname, Scheme_Object *port,
  1434. const mzchar *format, int flen,
  1435. int fpos, int offset, int argc, Scheme_Object **argv)
  1436. {
  1437. int i, start, end;
  1438. int used = offset;
  1439. int num_err = 0, char_err = 0, end_ok = 0;
  1440. Scheme_Object *a[2];
  1441. if (!format) {
  1442. if (!SCHEME_CHAR_STRINGP(argv[fpos])) {
  1443. scheme_wrong_type(procname, "format-string", fpos, argc, argv);
  1444. return;
  1445. }
  1446. format = SCHEME_CHAR_STR_VAL(argv[fpos]);
  1447. flen = SCHEME_CHAR_STRTAG_VAL(argv[fpos]);
  1448. } else if (flen == -1)
  1449. flen = strlen((char *)format);
  1450. /* Check string first: */
  1451. end = flen - 1;
  1452. for (i = 0; i < end; i++) {
  1453. if (format[i] == '~') {
  1454. i++;
  1455. if (scheme_isspace(format[i])) {
  1456. /* skip spaces... */
  1457. } else switch (format[i]) {
  1458. case '~':
  1459. if (i == end)
  1460. end_ok = 1;
  1461. break;
  1462. case '%':
  1463. case 'n':
  1464. case 'N':
  1465. break;
  1466. case 'a':
  1467. case 'A':
  1468. case 's':
  1469. case 'S':
  1470. case 'v':
  1471. case 'V':
  1472. case 'e':
  1473. case 'E':
  1474. used++;
  1475. break;
  1476. case '.':
  1477. switch (format[i+1]) {
  1478. case 'a':
  1479. case 'A':
  1480. case 's':
  1481. case 'S':
  1482. case 'v':
  1483. case 'V':
  1484. break;
  1485. default:
  1486. scheme_wrong_type(procname,
  1487. "pattern-string (tag `~.' not followed by `a', `s', or `v')",
  1488. fpos, argc, argv);
  1489. break;
  1490. }
  1491. used++;
  1492. break;
  1493. case 'x':
  1494. case 'X':
  1495. case 'o':
  1496. case 'O':
  1497. case 'b':
  1498. case 'B':
  1499. if (!num_err && !char_err && (used < argc)) {
  1500. Scheme_Object *o = argv[used];
  1501. if (!SCHEME_EXACT_REALP(o)
  1502. && (!SCHEME_COMPLEXP(o)
  1503. || !SCHEME_EXACT_REALP(scheme_complex_real_part(o))))
  1504. num_err = used + 1;
  1505. }
  1506. used++;
  1507. break;
  1508. case 'c':
  1509. case 'C':
  1510. if (!num_err && !char_err && (used < argc)) {
  1511. if (!SCHEME_CHARP(argv[used]))
  1512. char_err = used + 1;
  1513. }
  1514. used++;
  1515. break;
  1516. default:
  1517. {
  1518. char buffer[64];
  1519. sprintf(buffer, "pattern-string (tag `~%c' not allowed)", format[i]);
  1520. scheme_wrong_type(procname, buffer, fpos, argc, argv);
  1521. return;
  1522. }
  1523. }
  1524. }
  1525. }
  1526. if ((format[end] == '~') && !end_ok) {
  1527. scheme_wrong_type(procname, "pattern-string (cannot end in ~)", fpos, argc, argv);
  1528. return;
  1529. }
  1530. if (used != argc) {
  1531. char *args;
  1532. intptr_t alen;
  1533. args = scheme_make_args_string("", -1, argc, argv, &alen);
  1534. if (used > argc) {
  1535. scheme_raise_exn(MZEXN_FAIL_CONTRACT,
  1536. "%s: format string requires %d arguments, given %d%t",
  1537. procname, used - offset, argc - offset, args, alen);
  1538. } else {
  1539. scheme_raise_exn(MZEXN_FAIL_CONTRACT,
  1540. "%s: format string requires %d arguments, given %d%t",
  1541. procname, used - offset, argc - offset, args, alen);
  1542. }
  1543. return;
  1544. }
  1545. if (num_err || char_err) {
  1546. int pos = (num_err ? num_err : char_err) - 1;
  1547. char *args, *bstr;
  1548. intptr_t alen;
  1549. intptr_t blen;
  1550. char *type = (num_err ? "exact-number" : "character");
  1551. Scheme_Object *bad = argv[pos];
  1552. args = scheme_make_args_string("other ", pos, argc, argv, &alen);
  1553. bstr = scheme_make_provided_string(bad, 1, &blen);
  1554. scheme_raise_exn(MZEXN_FAIL_CONTRACT,
  1555. "%s: format string requires argument of type <%s>, given %t%t",
  1556. procname, type,
  1557. bstr, blen,
  1558. args, alen);
  1559. return;
  1560. }
  1561. for (used = offset, i = start = 0; i < flen; i++) {
  1562. if (format[i] == '~') {
  1563. if (start < i) {
  1564. (void)scheme_put_char_string(procname, port, format, start, i - start);
  1565. }
  1566. i++;
  1567. if (scheme_isspace(format[i])) {
  1568. /* skip spaces (at most one newline) */
  1569. do {
  1570. if ((format[i] == '\n') || (format[i] == '\r')) {
  1571. /* got one */
  1572. if ((format[i] == '\r') && (format[i + 1] == '\n'))
  1573. i++; /* Windows-style CR-NL */
  1574. i++;
  1575. while (portable_isspace(format[i])
  1576. && !((format[i] == '\n') || (format[i] == '\r'))) {
  1577. i++;
  1578. }
  1579. break;
  1580. } else
  1581. i++;
  1582. } while (scheme_isspace(format[i]));
  1583. --i; /* back up over something */
  1584. } else switch (format[i]) {
  1585. case '~':
  1586. scheme_write_byte_string("~", 1, port);
  1587. break;
  1588. case '%':
  1589. case 'n':
  1590. case 'N':
  1591. scheme_write_byte_string("\n", 1, port);
  1592. break;
  1593. case 'c':
  1594. case 'C':
  1595. case 'a':
  1596. case 'A':
  1597. a[0] = argv[used++];
  1598. a[1] = port;
  1599. _scheme_apply(scheme_display_proc, 2, a);
  1600. break;
  1601. case 's':
  1602. case 'S':
  1603. a[0] = argv[used++];
  1604. a[1] = port;
  1605. _scheme_apply(scheme_write_proc, 2, a);
  1606. break;
  1607. case 'v':
  1608. case 'V':
  1609. a[0] = argv[used++];
  1610. a[1] = port;
  1611. _scheme_apply(scheme_print_proc, 2, a);
  1612. break;
  1613. case 'e':
  1614. case 'E':
  1615. {
  1616. intptr_t len;
  1617. char *s;
  1618. s = scheme_make_provided_string(argv[used++], 0, &len);
  1619. scheme_write_byte_string(s, len, port);
  1620. }
  1621. break;
  1622. case '.':
  1623. {
  1624. intptr_t len;
  1625. char *s;
  1626. len = scheme_get_print_width();
  1627. i++;
  1628. switch (format[i]) {
  1629. case 'a':
  1630. case 'A':
  1631. s = scheme_display_to_string_w_max(argv[used++], &len, len);
  1632. break;
  1633. case 's':
  1634. case 'S':
  1635. s = scheme_write_to_string_w_max(argv[used++], &len, len);
  1636. break;
  1637. case 'v':
  1638. case 'V':
  1639. s = scheme_print_to_string_w_max(argv[used++], &len, len);
  1640. break;
  1641. default:
  1642. s = "???";
  1643. len = 3;
  1644. }
  1645. scheme_write_byte_string(s, len, port);
  1646. }
  1647. break;
  1648. case 'x':
  1649. case 'X':
  1650. case 'o':
  1651. case 'O':
  1652. case 'b':
  1653. case 'B':
  1654. {
  1655. char *s;
  1656. int radix;
  1657. switch(format[i]) {
  1658. case 'x':
  1659. case 'X':
  1660. radix = 16;
  1661. break;
  1662. case 'o':
  1663. case 'O':
  1664. radix = 8;
  1665. break;
  1666. default:
  1667. case 'b':
  1668. case 'B':
  1669. radix = 2;
  1670. break;
  1671. }
  1672. s = scheme_number_to_string(radix, argv[used++]);
  1673. scheme_write_byte_string(s, strlen(s), port);
  1674. }
  1675. break;
  1676. }
  1677. SCHEME_USE_FUEL(1);
  1678. start = i + 1;
  1679. }
  1680. }
  1681. SCHEME_USE_FUEL(flen);
  1682. if (start < i) {
  1683. (void)scheme_put_char_string(procname, port, format, start, i - start);
  1684. }
  1685. }
  1686. char *scheme_format(mzchar *format, int flen, int argc, Scheme_Object **argv, intptr_t *rlen)
  1687. {
  1688. Scheme_Object *port;
  1689. port = scheme_make_byte_string_output_port();
  1690. scheme_do_format("format", port, format, flen, 0, 0, argc, argv);
  1691. return scheme_get_sized_byte_string_output(port, rlen);
  1692. }
  1693. void scheme_printf(mzchar *format, int flen, int argc, Scheme_Object **argv)
  1694. {
  1695. scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
  1696. format, flen, 0, 0, argc, argv);
  1697. }
  1698. char *scheme_format_utf8(char *format, int flen, int argc, Scheme_Object **argv, intptr_t *rlen)
  1699. {
  1700. mzchar *s;
  1701. intptr_t srlen;
  1702. if (flen == -1)
  1703. flen = strlen(format);
  1704. s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
  1705. if (s)
  1706. return scheme_format(s, srlen, argc, argv, rlen);
  1707. else
  1708. return "";
  1709. }
  1710. void scheme_printf_utf8(char *format, int flen, int argc, Scheme_Object **argv)
  1711. {
  1712. mzchar *s;
  1713. intptr_t srlen;
  1714. if (flen == -1)
  1715. flen = strlen(format);
  1716. s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
  1717. if (s)
  1718. scheme_printf(s, srlen, argc, argv);
  1719. }
  1720. static Scheme_Object *
  1721. format(int argc, Scheme_Object *argv[])
  1722. {
  1723. Scheme_Object *port;
  1724. char *s;
  1725. intptr_t len;
  1726. port = scheme_make_byte_string_output_port();
  1727. scheme_do_format("format", port, NULL, 0, 0, 1, argc, argv);
  1728. s = scheme_get_sized_byte_string_output(port, &len);
  1729. return scheme_make_sized_utf8_string(s, len);
  1730. }
  1731. #ifdef INSTRUMENT_PRIMITIVES
  1732. extern int g_print_prims;
  1733. #endif
  1734. static Scheme_Object *
  1735. sch_printf(int argc, Scheme_Object *argv[])
  1736. {
  1737. scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
  1738. NULL, 0, 0, 1, argc, argv);
  1739. return scheme_void;
  1740. }
  1741. static Scheme_Object *
  1742. sch_eprintf(int argc, Scheme_Object *argv[])
  1743. {
  1744. scheme_do_format("eprintf", scheme_get_param(scheme_current_config(), MZCONFIG_ERROR_PORT),
  1745. NULL, 0, 0, 1, argc, argv);
  1746. return scheme_void;
  1747. }
  1748. static Scheme_Object *
  1749. sch_fprintf(int argc, Scheme_Object *argv[])
  1750. {
  1751. if (!SCHEME_OUTPUT_PORTP(argv[0]))
  1752. scheme_wrong_type("fprintf", "output-port", 0, argc, argv);
  1753. scheme_do_format("fprintf", argv[0], NULL, 0, 1, 2, argc, argv);
  1754. return scheme_void;
  1755. }
  1756. /********************************************************************/
  1757. /* misc */
  1758. /********************************************************************/
  1759. static Scheme_Object *
  1760. version(int argc, Scheme_Object *argv[])
  1761. {
  1762. return vers_str;
  1763. }
  1764. static Scheme_Object *
  1765. banner(int argc, Scheme_Object *argv[])
  1766. {
  1767. return banner_str;
  1768. }
  1769. char *scheme_version(void)
  1770. {
  1771. return MZSCHEME_VERSION;
  1772. }
  1773. #ifdef MZ_PRECISE_GC
  1774. /* don't print " [3m]", which is the default: */
  1775. # define VERSION_SUFFIX ""
  1776. #else
  1777. # ifdef USE_SENORA_GC
  1778. # define VERSION_SUFFIX " [cgc~]"
  1779. # else
  1780. # define VERSION_SUFFIX " [cgc]"
  1781. # endif
  1782. #endif
  1783. char *scheme_banner(void)
  1784. {
  1785. if (embedding_banner)
  1786. return embedding_banner;
  1787. else
  1788. return ("Welcome to Racket"
  1789. " v" MZSCHEME_VERSION VERSION_SUFFIX
  1790. ".\n");
  1791. }
  1792. void scheme_set_banner(char *s)
  1793. {
  1794. embedding_banner = s;
  1795. }
  1796. int scheme_byte_string_has_null(Scheme_Object *o)
  1797. {
  1798. const char *s = SCHEME_BYTE_STR_VAL(o);
  1799. int i = SCHEME_BYTE_STRTAG_VAL(o);
  1800. while (i--) {
  1801. if (!s[i])
  1802. return 1;
  1803. }
  1804. return 0;
  1805. }
  1806. int scheme_any_string_has_null(Scheme_Object *o)
  1807. {
  1808. if (SCHEME_BYTE_STRINGP(o))
  1809. return scheme_byte_string_has_null(o);
  1810. else {
  1811. const mzchar *s = SCHEME_CHAR_STR_VAL(o);
  1812. int i = SCHEME_CHAR_STRTAG_VAL(o);
  1813. while (i--) {
  1814. if (!s[i])
  1815. return 1;
  1816. }
  1817. return 0;
  1818. }
  1819. }
  1820. /***********************************************************************/
  1821. /* Environment Variables */
  1822. /***********************************************************************/
  1823. #if defined(MZ_USE_PLACES) && defined(MZ_PRECISE_GC)
  1824. static char* clone_str_with_gc(const char* buffer) {
  1825. int length;
  1826. char *newbuffer;
  1827. length = strlen(buffer);
  1828. newbuffer = scheme_malloc_atomic(length+1);
  1829. memcpy(newbuffer, buffer, length+1);
  1830. return newbuffer;
  1831. }
  1832. #endif
  1833. static void create_putenv_str_table_if_needed() {
  1834. if (!putenv_str_table) {
  1835. putenv_str_table = scheme_make_hash_table(SCHEME_hash_string);
  1836. }
  1837. }
  1838. #ifndef DOS_FILE_SYSTEM
  1839. static void putenv_str_table_put_name(Scheme_Object *name, Scheme_Object *value) {
  1840. #if defined(MZ_USE_PLACES) && defined(MZ_PRECISE_GC)
  1841. void *original_gc;
  1842. Scheme_Object *name_copy;
  1843. original_gc = GC_switch_to_master_gc();
  1844. scheme_start_atomic();
  1845. name_copy = (Scheme_Object *) clone_str_with_gc((const char *) name);
  1846. create_putenv_str_table_if_needed();
  1847. scheme_hash_set(putenv_str_table, name_copy, value);
  1848. scheme_end_atomic_no_swap();
  1849. GC_switch_back_from_master(original_gc);
  1850. #else
  1851. create_putenv_str_table_if_needed();
  1852. scheme_hash_set(putenv_str_table, name, value);
  1853. #endif
  1854. }
  1855. #endif
  1856. #ifndef GETENV_FUNCTION
  1857. static void putenv_str_table_put_name_value(Scheme_Object *name, Scheme_Object *value) {
  1858. #if defined(MZ_USE_PLACES) && defined(MZ_PRECISE_GC)
  1859. void *original_gc;
  1860. Scheme_Object *name_copy;
  1861. Scheme_Object *value_copy;
  1862. original_gc = GC_switch_to_master_gc();
  1863. scheme_start_atomic();
  1864. name_copy = (Scheme_Object *) clone_str_with_gc((const char *) name);
  1865. value_copy = (Scheme_Object *) clone_str_with_gc((const char *) value);
  1866. create_putenv_str_table_if_needed();
  1867. scheme_hash_set(putenv_str_table, name_copy, value_copy);
  1868. scheme_end_atomic_no_swap();
  1869. GC_switch_back_from_master(original_gc);
  1870. #else
  1871. create_putenv_str_table_if_needed();
  1872. scheme_hash_set(putenv_str_table, name, value);
  1873. #endif
  1874. }
  1875. #endif
  1876. #if !defined(GETENV_FUNCTION) || defined(MZ_PRECISE_GC)
  1877. static Scheme_Object *putenv_str_table_get(Scheme_Object *name) {
  1878. #if defined(MZ_USE_PLACES) && defined(MZ_PRECISE_GC)
  1879. void *original_gc;
  1880. Scheme_Object *value;
  1881. original_gc = GC_switch_to_master_gc();
  1882. scheme_start_atomic();
  1883. create_putenv_str_table_if_needed();
  1884. value = scheme_hash_get(putenv_str_table, name);
  1885. scheme_end_atomic_no_swap();
  1886. GC_switch_back_from_master(original_gc);
  1887. return value;
  1888. #else
  1889. create_putenv_str_table_if_needed();
  1890. return scheme_hash_get(putenv_str_table, name);
  1891. #endif
  1892. }
  1893. #endif
  1894. static int sch_bool_getenv(const char* name);
  1895. void
  1896. scheme_init_getenv(void)
  1897. {
  1898. #ifndef GETENV_FUNCTION
  1899. FILE *f = fopen("Environment", "r");
  1900. if (f) {
  1901. Scheme_Object *p = scheme_make_file_input_port(f);
  1902. mz_jmp_buf *savebuf, newbuf;
  1903. savebuf = scheme_current_thread->error_buf;
  1904. scheme_current_thread->error_buf = &newbuf;
  1905. if (!scheme_setjmp(newbuf)) {
  1906. while (1) {
  1907. Scheme_Object *v = scheme_read(p);
  1908. if (SCHEME_EOFP(v))
  1909. break;
  1910. if (SCHEME_PAIRP(v) && SCHEME_PAIRP(SCHEME_CDR(v))
  1911. && SCHEME_NULLP(SCHEME_CDR(SCHEME_CDR(v)))) {
  1912. Scheme_Object *key = SCHEME_CAR(v);
  1913. Scheme_Object *val = SCHEME_CADR(v);
  1914. if (SCHEME_STRINGP(key) && SCHEME_STRINGP(val)) {
  1915. Scheme_Object *a[2];
  1916. a[0] = key;
  1917. a[1] = val;
  1918. sch_putenv(2, a);
  1919. v = NULL;
  1920. }
  1921. }
  1922. if (v)
  1923. scheme_signal_error("bad environment specification: %V", v);
  1924. }
  1925. }
  1926. scheme_current_thread->error_buf = savebuf;
  1927. scheme_close_input_port(p);
  1928. }
  1929. #endif
  1930. if (sch_bool_getenv("PLTNOMZJIT")) {
  1931. scheme_set_startup_use_jit(0);
  1932. }
  1933. }
  1934. #ifdef DOS_FILE_SYSTEM
  1935. # include <windows.h>
  1936. static char *dos_win_getenv(const char *name) {
  1937. int value_size;
  1938. value_size = GetEnvironmentVariable(name, NULL, 0);
  1939. if (value_size) {
  1940. char *value;
  1941. int got;
  1942. value = scheme_malloc_atomic(value_size);
  1943. got = GetEnvironmentVariable(name, value, value_size);
  1944. if (got < value_size)
  1945. value[got] = 0;
  1946. return value;
  1947. }
  1948. return NULL;
  1949. }
  1950. #endif
  1951. static int sch_bool_getenv(const char* name) {
  1952. int rc = 0;
  1953. #ifdef GETENV_FUNCTION
  1954. # ifdef DOS_FILE_SYSTEM
  1955. if (GetEnvironmentVariable(name, NULL, 0)) rc = 1;
  1956. # else
  1957. if (getenv(name)) rc = 1;
  1958. # endif
  1959. #else
  1960. if (putenv_str_table_get(name)) rc = 1;
  1961. #endif
  1962. return rc;
  1963. }
  1964. static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[])
  1965. {
  1966. char *name;
  1967. char *value;
  1968. Scheme_Object *bs;
  1969. if (!SCHEME_CHAR_STRINGP(argv[0]) || scheme_any_string_has_null(argv[0]))
  1970. scheme_wrong_type("getenv", CHAR_STRING_W_NO_NULLS, 0, argc, argv);
  1971. bs = scheme_char_string_to_byte_string_locale(argv[0]);
  1972. name = SCHEME_BYTE_STR_VAL(bs);
  1973. #ifdef GETENV_FUNCTION
  1974. # ifdef DOS_FILE_SYSTEM
  1975. value = dos_win_getenv(name);
  1976. # else
  1977. value = getenv(name);
  1978. # endif
  1979. #else
  1980. {
  1981. Scheme_Object *hash_value;
  1982. hash_value = putenv_str_table_get(name);
  1983. return hash_value ? hash_value : scheme_false;
  1984. }
  1985. #endif
  1986. return value ? scheme_make_locale_string(value) : scheme_false;
  1987. }
  1988. #ifndef DOS_FILE_SYSTEM
  1989. static int sch_unix_putenv(const char *var, const char *val, const intptr_t varlen, const intptr_t vallen) {
  1990. char *buffer;
  1991. intptr_t total_length;
  1992. total_length = varlen + vallen + 2;
  1993. #ifdef MZ_PRECISE_GC
  1994. /* Can't put moveable string into array. */
  1995. buffer = malloc(total_length);
  1996. #else
  1997. buffer = (char *)scheme_malloc_atomic(total_length);
  1998. #endif
  1999. memcpy(buffer, var, varlen);
  2000. buffer[varlen] = '=';
  2001. memcpy(buffer + varlen + 1, val, vallen + 1);
  2002. #ifdef MZ_PRECISE_GC
  2003. {
  2004. /* Free old, if in table: */
  2005. char *oldbuffer;
  2006. oldbuffer = (char *)putenv_str_table_get((Scheme_Object *)var);
  2007. if (oldbuffer)
  2008. free(oldbuffer);
  2009. }
  2010. #endif
  2011. /* if precise the buffer needs to be remembered so it can be freed */
  2012. /* if not precise the buffer needs to be rooted so it doesn't get collected prematurely */
  2013. putenv_str_table_put_name((Scheme_Object *)var, (Scheme_Object *)buffer);
  2014. return putenv(buffer);
  2015. }
  2016. #endif
  2017. static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[])
  2018. {
  2019. Scheme_Object *varbs;
  2020. Scheme_Object *valbs;
  2021. char *var;
  2022. char *val;
  2023. int rc = 0;
  2024. if (!SCHEME_CHAR_STRINGP(argv[0]) || scheme_any_string_has_null(argv[0]))
  2025. scheme_wrong_type("putenv", CHAR_STRING_W_NO_NULLS, 0, argc, argv);
  2026. if (!SCHEME_CHAR_STRINGP(argv[1]) || scheme_any_string_has_null(argv[1]))
  2027. scheme_wrong_type("putenv", CHAR_STRING_W_NO_NULLS, 1, argc, argv);
  2028. varbs = scheme_char_string_to_byte_string_locale(argv[0]);
  2029. var = SCHEME_BYTE_STR_VAL(varbs);
  2030. valbs = scheme_char_string_to_byte_string_locale(argv[1]);
  2031. val = SCHEME_BYTE_STR_VAL(valbs);
  2032. #ifdef GETENV_FUNCTION
  2033. # ifdef DOS_FILE_SYSTEM
  2034. rc = !SetEnvironmentVariable(var, val);
  2035. # else
  2036. rc = sch_unix_putenv(var, val, SCHEME_BYTE_STRLEN_VAL(varbs), SCHEME_BYTE_STRLEN_VAL(valbs));
  2037. # endif
  2038. #else
  2039. putenv_str_table_put_name_value(argv[0], argv[1]);
  2040. #endif
  2041. return rc ? scheme_false : scheme_true;
  2042. }
  2043. /***********************************************************************/
  2044. /* End Environment Variables */
  2045. /***********************************************************************/
  2046. static void machine_details(char *s);
  2047. static Scheme_Object *system_type(int argc, Scheme_Object *argv[])
  2048. {
  2049. if (argc) {
  2050. Scheme_Object *sym;
  2051. sym = scheme_intern_symbol("link");
  2052. if (SAME_OBJ(argv[0], sym)) {
  2053. #if defined(OS_X) && !defined(XONX)
  2054. return scheme_intern_symbol("framework");
  2055. #else
  2056. # ifdef DOS_FILE_SYSTEM
  2057. return scheme_intern_symbol("dll");
  2058. # else
  2059. # ifdef MZ_USES_SHARED_LIB
  2060. return scheme_intern_symbol("shared");
  2061. # else
  2062. return scheme_intern_symbol("static");
  2063. # endif
  2064. # endif
  2065. #endif
  2066. }
  2067. sym = scheme_intern_symbol("machine");
  2068. if (SAME_OBJ(argv[0], sym)) {
  2069. char buff[1024];
  2070. machine_details(buff);
  2071. return scheme_make_utf8_string(buff);
  2072. }
  2073. sym = scheme_intern_symbol("gc");
  2074. if (SAME_OBJ(argv[0], sym)) {
  2075. #ifdef MZ_PRECISE_GC
  2076. return scheme_intern_symbol("3m");
  2077. #else
  2078. return scheme_intern_symbol("cgc");
  2079. #endif
  2080. }
  2081. sym = scheme_intern_symbol("so-suffix");
  2082. if (SAME_OBJ(argv[0], sym)) {
  2083. #ifdef DOS_FILE_SYSTEM
  2084. return scheme_make_byte_string(".dll");
  2085. #else
  2086. # ifdef OS_X
  2087. return scheme_make_byte_string(".dylib");
  2088. # else
  2089. # ifdef USE_CYGWIN_SO_SUFFIX
  2090. return scheme_make_byte_string(".dll");
  2091. # else
  2092. return scheme_make_byte_string(".so");
  2093. # endif
  2094. # endif
  2095. #endif
  2096. }
  2097. sym = scheme_intern_symbol("os");
  2098. if (!SAME_OBJ(argv[0], sym)) {
  2099. scheme_wrong_type("system-type", "'os, 'link, 'machine, 'gc, or 'so-suffix", 0, argc, argv);
  2100. return NULL;
  2101. }
  2102. }
  2103. return sys_symbol;
  2104. }
  2105. static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[])
  2106. {
  2107. if (argc > 0) {
  2108. Scheme_Object *sym;
  2109. if (SCHEME_FALSEP(argv[0]))
  2110. return platform_cgc_path;
  2111. sym = scheme_intern_symbol("cgc");
  2112. if (SAME_OBJ(sym, argv[0]))
  2113. return platform_cgc_path;
  2114. sym = scheme_intern_symbol("3m");
  2115. if (SAME_OBJ(sym, argv[0]))
  2116. return platform_3m_path;
  2117. scheme_wrong_type("system-library-subpath", "'cgc, '3m, or #f", 0, argc, argv);
  2118. return NULL;
  2119. } else {
  2120. #ifdef MZ_PRECISE_GC
  2121. return platform_3m_path;
  2122. #else
  2123. return platform_cgc_path;
  2124. #endif
  2125. }
  2126. }
  2127. const char *scheme_system_library_subpath()
  2128. {
  2129. return SCHEME_PLATFORM_LIBRARY_SUBPATH;
  2130. }
  2131. /* Our own strncpy - which would be really stupid, except the one for
  2132. the implementation in Solaris 2.6 is broken (it doesn't always stop
  2133. at the null terminator). */
  2134. int scheme_strncmp(const char *a, const char *b, int len)
  2135. {
  2136. while (len-- && (*a == *b) && *a) {
  2137. a++;
  2138. b++;
  2139. }
  2140. if (len < 0)
  2141. return 0;
  2142. else
  2143. return *a - *b;
  2144. }
  2145. static Scheme_Object *ok_cmdline(int argc, Scheme_Object **argv)
  2146. {
  2147. if (SCHEME_CHAPERONE_VECTORP(argv[0])) {
  2148. Scheme_Object *vec = argv[0], *vec2, *str;
  2149. int i, size = SCHEME_VEC_SIZE(vec);
  2150. if (!size)
  2151. return vec;
  2152. for (i = 0; i < size; i++) {
  2153. if (!SCHEME_CHAR_STRINGP(SCHEME_VEC_ELS(vec)[i]))
  2154. return NULL;
  2155. }
  2156. /* Make sure vector and strings are immutable: */
  2157. vec2 = scheme_make_vector(size, NULL);
  2158. if (size)
  2159. SCHEME_SET_VECTOR_IMMUTABLE(vec2);
  2160. for (i = 0; i < size; i++) {
  2161. str = SCHEME_VEC_ELS(vec)[i];
  2162. if (!SCHEME_IMMUTABLE_CHAR_STRINGP(str)) {
  2163. str = scheme_make_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 0);
  2164. SCHEME_SET_CHAR_STRING_IMMUTABLE(str);
  2165. }
  2166. SCHEME_VEC_ELS(vec2)[i] = str;
  2167. }
  2168. return vec2;
  2169. }
  2170. return NULL;
  2171. }
  2172. static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[])
  2173. {
  2174. return scheme_param_config("current-command-line-arguments",
  2175. scheme_make_integer(MZCONFIG_CMDLINE_ARGS),
  2176. argc, argv,
  2177. -1, ok_cmdline, "vector of strings", 1);
  2178. }
  2179. /**********************************************************************/
  2180. /* locale ops */
  2181. /**********************************************************************/
  2182. static Scheme_Object *ok_locale(int argc, Scheme_Object **argv)
  2183. {
  2184. if (SCHEME_FALSEP(argv[0]))
  2185. return argv[0];
  2186. else if (SCHEME_CHAR_STRINGP(argv[0])) {
  2187. if (SCHEME_IMMUTABLEP(argv[0]))
  2188. return argv[0];
  2189. else {
  2190. Scheme_Object *str = argv[0];
  2191. str = scheme_make_immutable_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 1);
  2192. return str;
  2193. }
  2194. }
  2195. return NULL;
  2196. }
  2197. static Scheme_Object *current_locale(int argc, Scheme_Object *argv[])
  2198. {
  2199. Scheme_Object *v;
  2200. v = scheme_param_config("current-locale",
  2201. scheme_make_integer(MZCONFIG_LOCALE),
  2202. argc, argv,
  2203. -1, ok_locale, "#f or string", 1);
  2204. return v;
  2205. }
  2206. static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[])
  2207. {
  2208. reset_locale();
  2209. if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on)
  2210. return scheme_make_utf8_string("UTF-8");
  2211. #if HAVE_CODESET
  2212. return scheme_make_utf8_string(nl_langinfo(CODESET));
  2213. #else
  2214. /* nl_langinfo doesn't work, so just make up something */
  2215. return scheme_make_utf8_string("UTF-8");
  2216. #endif
  2217. }
  2218. static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[])
  2219. {
  2220. #ifdef MACOS_UNICODE_SUPPORT
  2221. /* Mac OS X */
  2222. CFLocaleRef l;
  2223. CFStringRef s;
  2224. int len;
  2225. char *r;
  2226. l = CFLocaleCopyCurrent();
  2227. s = CFLocaleGetIdentifier(l);
  2228. len = CFStringGetLength(s);
  2229. r = (char *)scheme_malloc_atomic(len * 6 + 1);
  2230. CFStringGetCString(s, r, len * 6 + 1, kCFStringEncodingUTF8);
  2231. CFRelease(l);
  2232. return scheme_make_sized_utf8_string(r, 5);
  2233. #else
  2234. # ifdef WINDOWS_UNICODE_SUPPORT
  2235. /* Windows */
  2236. LCID l;
  2237. int llen, clen;
  2238. char *lang, *country, *s;
  2239. l = GetUserDefaultLCID();
  2240. llen = GetLocaleInfo(l, LOCALE_SENGLANGUAGE, NULL, 0);
  2241. lang = (char *)scheme_malloc_atomic(llen);
  2242. GetLocaleInfo(l, LOCALE_SENGLANGUAGE, lang, llen);
  2243. if (llen)
  2244. llen -= 1; /* drop nul terminator */
  2245. clen = GetLocaleInfo(l, LOCALE_SENGCOUNTRY, NULL, 0);
  2246. country = (char *)scheme_malloc_atomic(clen);
  2247. GetLocaleInfo(l, LOCALE_SENGCOUNTRY, country, clen);
  2248. if (clen)
  2249. clen -= 1; /* drop nul terminator */
  2250. s = (char *)scheme_malloc_atomic(clen + llen + 1);
  2251. memcpy(s, lang, llen);
  2252. memcpy(s + 1 + llen, country, clen);
  2253. s[llen] = '_';
  2254. return scheme_make_sized_utf8_string(s, llen + 1 + clen);
  2255. # else
  2256. /* Unix */
  2257. char *s;
  2258. s = getenv("LC_ALL");
  2259. if (!s)
  2260. s = getenv("LC_CTYPE");
  2261. if (!s)
  2262. s = getenv("LANG");
  2263. if (s) {
  2264. /* Check that the environment variable has the form
  2265. xx_XX[.ENC] */
  2266. if ((s[0] >= 'a') && (s[0] <= 'z')
  2267. && (s[1] >= 'a') && (s[1] <= 'z')
  2268. && (s[2] == '_')
  2269. && (s[3] >= 'A') && (s[3] <= 'Z')
  2270. && (s[4] >= 'A') && (s[4] <= 'Z')
  2271. && (!s[5] || s[5] == '.')) {
  2272. /* Good */
  2273. } else
  2274. s = NULL;
  2275. }
  2276. if (!s)
  2277. s = "en_US";
  2278. return scheme_make_sized_utf8_string(s, 5);
  2279. # endif
  2280. #endif
  2281. }
  2282. #ifndef DONT_USE_LOCALE
  2283. #define ICONV_ARG_CAST /* empty */
  2284. static char *do_convert(iconv_t cd,
  2285. /* if cd == -1 and either from_e or to_e can be NULL, then
  2286. reset_locale() must have been called */
  2287. const char *from_e, const char *to_e,
  2288. /* 1 => UCS-4 -> UTF-8; 2 => UTF-8 -> UCS-4; 0 => other */
  2289. int to_from_utf8,
  2290. /* in can be NULL to output just a shift; in that case,
  2291. id should be 0, too */
  2292. char *in, int id, int iilen,
  2293. char *out, int od, int iolen,
  2294. /* if grow, then reallocate when out isn't big enough */
  2295. int grow,
  2296. /* if add_end_shift, add a shift sequence to the end;
  2297. not useful if in is already NULL to indicate a shift */
  2298. int add_end_shift,
  2299. /* extra specifies the length of a terminator,
  2300. not included in iolen or *oolen */
  2301. int extra,
  2302. /* these two report actual read/wrote sizes: */
  2303. intptr_t *oilen, intptr_t *oolen,
  2304. /* status is set to
  2305. 0 for complete,
  2306. -1 for partial input,
  2307. -2 for error,
  2308. 1 for more avail */
  2309. int *status)
  2310. {
  2311. int dip, dop, close_it = 0, mz_utf8 = 0;
  2312. size_t il, ol, r;
  2313. GC_CAN_IGNORE char *ip, *op;
  2314. /* Defaults: */
  2315. *status = -1;
  2316. if (oilen)
  2317. *oilen = 0;
  2318. *oolen = 0;
  2319. if (cd == (iconv_t)-1) {
  2320. if (!iconv_ready) init_iconv();
  2321. if (mzCHK_PROC(iconv_open)) {
  2322. if (!from_e)
  2323. from_e = mz_iconv_nl_langinfo();
  2324. if (!to_e)
  2325. to_e = mz_iconv_nl_langinfo();
  2326. cd = iconv_open(to_e, from_e);
  2327. close_it = 1;
  2328. } else if (to_from_utf8) {
  2329. /* Assume UTF-8 */
  2330. mz_utf8 = 1;
  2331. }
  2332. }
  2333. if ((cd == (iconv_t)-1) && !mz_utf8) {
  2334. if (out) {
  2335. while (extra--) {
  2336. out[extra] = 0;
  2337. }
  2338. }
  2339. return out;
  2340. }
  2341. /* The converter is ready. Allocate out space, if necessary */
  2342. if (!out) {
  2343. if (iolen <= 0)
  2344. iolen = iilen;
  2345. out = (char *)scheme_malloc_atomic(iolen + extra);
  2346. od = 0;
  2347. }
  2348. /* il and ol are the number of available chars */
  2349. il = iilen;
  2350. ol = iolen;
  2351. /* dip and dop are the number of characters read so far;
  2352. we use these and NULL out the ip and op pointers
  2353. for the sake of precise GC */
  2354. dip = 0;
  2355. dop = 0;
  2356. if (!in)
  2357. add_end_shift = 0;
  2358. while (1) {
  2359. int icerr;
  2360. if (mz_utf8) {
  2361. /* Use our UTF-8 routines as if they were iconv */
  2362. if (to_from_utf8 == 1) {
  2363. /* UCS-4 -> UTF-8 */
  2364. /* We assume that in + id and iilen are mzchar-aligned */
  2365. int opos, uid, uilen;
  2366. uid = (id + dip) >> 2;
  2367. uilen = (iilen - dip) >> 2;
  2368. opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
  2369. NULL, 0,
  2370. 0);
  2371. if (opos <= iolen) {
  2372. opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
  2373. (unsigned char *)out, od + dop,
  2374. 0);
  2375. dop += opos;
  2376. dip += iilen;
  2377. icerr = 0;
  2378. r = (size_t)opos;
  2379. } else {
  2380. icerr = E2BIG;
  2381. r = (size_t)-1;
  2382. }
  2383. } else {
  2384. /* UTF-8 -> UCS-4 */
  2385. /* We assume that out + od is mzchar-aligned */
  2386. intptr_t ipos, opos;
  2387. r = utf8_decode_x((unsigned char *)in, id + dip, iilen,
  2388. (unsigned int *)out, (od + dop) >> 2, iolen >> 2,
  2389. &ipos, &opos,
  2390. 0, 0, NULL, 0, 0);
  2391. opos <<= 2;
  2392. dop = (opos - od);
  2393. dip = (ipos - id);
  2394. if ((r == -1) || (r == -2)) {
  2395. r = (size_t)-1;
  2396. icerr = EILSEQ;
  2397. } else if (r == -3) {
  2398. icerr = E2BIG;
  2399. r = (size_t)-1;
  2400. } else
  2401. icerr = 0;
  2402. }
  2403. } else {
  2404. ip = in XFORM_OK_PLUS id + dip;
  2405. op = out XFORM_OK_PLUS od + dop;
  2406. r = iconv(cd, ICONV_ARG_CAST &ip, &il, &op, &ol);
  2407. dip = ip - (in XFORM_OK_PLUS id);
  2408. dop = op - (out XFORM_OK_PLUS od);
  2409. ip = op = NULL;
  2410. icerr = ICONV_errno;
  2411. }
  2412. /* Record how many chars processed, now */
  2413. if (oilen)
  2414. *oilen = dip;
  2415. *oolen = dop;
  2416. /* Got all the chars? */
  2417. if (r == (size_t)-1) {
  2418. if (icerr == E2BIG) {
  2419. if (grow) {
  2420. /* Double the string size and try again */
  2421. char *naya;
  2422. naya = (char *)scheme_malloc_atomic((iolen * 2) + extra);
  2423. memcpy(naya, out + od, *oolen);
  2424. ol += iolen;
  2425. iolen += iolen;
  2426. out = naya;
  2427. od = 0;
  2428. } else {
  2429. *status = 1;
  2430. if (close_it)
  2431. iconv_close(cd);
  2432. while (extra--) {
  2433. out[od + dop + extra] = 0;
  2434. }
  2435. return out;
  2436. }
  2437. } else {
  2438. /* Either EINVAL (premature end) or EILSEQ (bad sequence) */
  2439. if (icerr == EILSEQ)
  2440. *status = -2;
  2441. if (close_it)
  2442. iconv_close(cd);
  2443. while (extra--) {
  2444. out[od + dop + extra] = 0;
  2445. }
  2446. return out;
  2447. }
  2448. } else {
  2449. /* All done... */
  2450. if (add_end_shift) {
  2451. add_end_shift = 0;
  2452. in = NULL;
  2453. dip = 0;
  2454. id = 0;
  2455. il = 0; /* should be redundant */
  2456. oilen = NULL; /* so it doesn't get set to 0 */
  2457. } else {
  2458. *status = 0;
  2459. if (close_it)
  2460. iconv_close(cd);
  2461. while (extra--) {
  2462. out[od + dop + extra] = 0;
  2463. }
  2464. return out;
  2465. }
  2466. }
  2467. }
  2468. }
  2469. #define MZ_SC_BUF_SIZE 32
  2470. static char *string_to_from_locale(int to_bytes,
  2471. char *in, intptr_t delta, intptr_t len,
  2472. intptr_t *olen, int perm,
  2473. int *no_cvt)
  2474. /* Call this function only when iconv is available, and only when
  2475. reset_locale() has been called */
  2476. {
  2477. Scheme_Object *parts = scheme_null, *one;
  2478. char *c;
  2479. intptr_t clen, used;
  2480. int status;
  2481. iconv_t cd;
  2482. if (!iconv_ready) init_iconv();
  2483. if (to_bytes)
  2484. cd = iconv_open(mz_iconv_nl_langinfo(), MZ_UCS4_NAME);
  2485. else
  2486. cd = iconv_open(MZ_UCS4_NAME, mz_iconv_nl_langinfo());
  2487. if (cd == (iconv_t)-1) {
  2488. *no_cvt = 1;
  2489. return NULL;
  2490. }
  2491. *no_cvt = 0;
  2492. while (len) {
  2493. /* We might have conversion errors... */
  2494. c = do_convert(cd, NULL, NULL, 0,
  2495. (char *)in, (to_bytes ? 4 : 1) * delta, (to_bytes ? 4 : 1) * len,
  2496. NULL, 0, (to_bytes ? 1 : 4) * (len + 1),
  2497. 1 /* grow */, 1, (to_bytes ? 1 : 4) /* terminator size */,
  2498. &used, &clen,
  2499. &status);
  2500. if (to_bytes)
  2501. used >>= 2;
  2502. if ((perm < 0) && (used < len)) {
  2503. iconv_close(cd);
  2504. return NULL;
  2505. }
  2506. delta += used;
  2507. len -= used;
  2508. if (!len && SCHEME_NULLP(parts)) {
  2509. if (to_bytes) {
  2510. *olen = clen;
  2511. c[*olen] = 0;
  2512. } else {
  2513. *olen = (clen >> 2);
  2514. ((mzchar *)c)[*olen] = 0;
  2515. }
  2516. iconv_close(cd);
  2517. return c;
  2518. }
  2519. /* We can get here if there was some conversion error at some
  2520. point. We're building up a list of parts. */
  2521. if (to_bytes) {
  2522. one = scheme_make_sized_byte_string(c, clen, 0);
  2523. } else {
  2524. one = scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0);
  2525. }
  2526. parts = scheme_make_pair(one, parts);
  2527. if (len) {
  2528. /* Conversion error, so skip one char. */
  2529. if (to_bytes) {
  2530. char bc[1];
  2531. bc[0] = perm;
  2532. one = scheme_make_sized_byte_string(bc, 1, 1);
  2533. } else {
  2534. mzchar bc[1];
  2535. bc[0] = perm;
  2536. one = scheme_make_sized_char_string(bc, 1, 1);
  2537. }
  2538. parts = scheme_make_pair(one, parts);
  2539. delta += 1;
  2540. len -= 1;
  2541. }
  2542. }
  2543. iconv_close(cd);
  2544. if (to_bytes) {
  2545. parts = append_all_byte_strings_backwards(parts);
  2546. *olen = SCHEME_BYTE_STRTAG_VAL(parts);
  2547. return SCHEME_BYTE_STR_VAL(parts);
  2548. } else {
  2549. parts = append_all_strings_backwards(parts);
  2550. *olen = SCHEME_CHAR_STRTAG_VAL(parts);
  2551. return (char *)SCHEME_CHAR_STR_VAL(parts);
  2552. }
  2553. }
  2554. static char *locale_recase(int to_up,
  2555. /* in must be null-terminated, iilen doesn't include it */
  2556. char *in, int id, int iilen,
  2557. /* iolen, in contrast, includes the terminator */
  2558. char *out, int od, int iolen,
  2559. intptr_t *oolen)
  2560. /* Assumes that reset_locale() has been called */
  2561. {
  2562. #ifdef NO_MBTOWC_FUNCTIONS
  2563. /* No wide-char functions...
  2564. The C library's toupper and tolower is supposed to be
  2565. locale-sensitive. It can't be right for characters that are
  2566. encoded in multiple bytes, but probably it will do the right
  2567. thing in common cases. */
  2568. int i;
  2569. /* First, copy "in" to "out" */
  2570. if (iilen + 1 >= iolen) {
  2571. out = (char *)scheme_malloc_atomic(iilen + 1);
  2572. od = 0;
  2573. }
  2574. memcpy(out + od, in + id, iilen);
  2575. out[od + iilen] = 0;
  2576. *oolen = iilen;
  2577. /* Re-case chars in "out" */
  2578. for (i = 0; i < iilen; i++) {
  2579. if (to_up)
  2580. out[od + i] = toupper(out[od + i]);
  2581. else
  2582. out[od + i] = tolower(out[od + i]);
  2583. }
  2584. return out;
  2585. #else
  2586. /* To change the case, convert the string to multibyte, re-case the
  2587. multibyte, then convert back. */
  2588. # define MZ_WC_BUF_SIZE 32
  2589. GC_CAN_IGNORE mbstate_t state;
  2590. size_t wl, wl2, ml, ml2;
  2591. wchar_t *wc, *ws, wcbuf[MZ_WC_BUF_SIZE], cwc;
  2592. const char *s;
  2593. unsigned int j;
  2594. /* The "n" versions are apparently not too standard: */
  2595. # define mz_mbsnrtowcs(t, f, fl, tl, s) mbsrtowcs(t, f, tl, s)
  2596. # define mz_wcsnrtombs(t, f, fl, tl, s) wcsrtombs(t, f, tl, s)
  2597. /* ----- to wide char ---- */
  2598. /* Get length */
  2599. memset(&state, 0, sizeof(mbstate_t));
  2600. s = in XFORM_OK_PLUS id;
  2601. wl = mz_mbsnrtowcs(NULL, &s, iilen, 0, &state);
  2602. s = NULL;
  2603. /* Allocate space */
  2604. if (wl < MZ_WC_BUF_SIZE) {
  2605. wc = wcbuf;
  2606. } else {
  2607. wc = (wchar_t *)scheme_malloc_atomic(sizeof(wchar_t) * (wl + 1));
  2608. }
  2609. /* Convert */
  2610. memset(&state, 0, sizeof(mbstate_t));
  2611. s = in XFORM_OK_PLUS id;
  2612. wl2 = mz_mbsnrtowcs(wc, &s, iilen, wl + 1, &state);
  2613. s = NULL;
  2614. wc[wl] = 0; /* just in case */
  2615. /* ---- re-case ---- */
  2616. if (to_up) {
  2617. for (j = 0; j < wl; j++) {
  2618. cwc = towupper(wc[j]);
  2619. wc[j] = cwc;
  2620. }
  2621. } else {
  2622. for (j = 0; j < wl; j++) {
  2623. cwc = towlower(wc[j]);
  2624. wc[j] = cwc;
  2625. }
  2626. }
  2627. /* ---- back to multibyte ---- */
  2628. /* Measure */
  2629. memset(&state, 0, sizeof(mbstate_t));
  2630. ws = wc;
  2631. ml = mz_wcsnrtombs(NULL, (const wchar_t **)&ws, wl, 0, &state);
  2632. ws = NULL;
  2633. /* Allocate space */
  2634. *oolen = ml;
  2635. if (ml + 1 >= (unsigned int)iolen) {
  2636. out = (char *)scheme_malloc_atomic(ml + 1);
  2637. od = 0;
  2638. }
  2639. /* Convert */
  2640. memset(&state, 0, sizeof(mbstate_t));
  2641. ws = wc;
  2642. ml2 = mz_wcsnrtombs(out + od, (const wchar_t **)&ws, wl, ml + 1, &state);
  2643. ws = NULL;
  2644. out[od + ml] = 0;
  2645. return out;
  2646. #endif
  2647. }
  2648. int mz_locale_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
  2649. /* The s1 and s2 arguments are actually UCS-4.
  2650. Assumes that reset_locale() has been called. */
  2651. {
  2652. intptr_t clen1, clen2, used1, used2, origl1, origl2;
  2653. char *c1, *c2, buf1[MZ_SC_BUF_SIZE], buf2[MZ_SC_BUF_SIZE];
  2654. char case_buf1[MZ_SC_BUF_SIZE], case_buf2[MZ_SC_BUF_SIZE];
  2655. int status, got_more;
  2656. /* First, convert UCS-4 to locale-specific encoding. If some
  2657. characters don't fit into the encoding, then we'll have leftover
  2658. characters. Count unconvertable charc as greater than anything
  2659. that can be converted */
  2660. origl1 = l1;
  2661. origl2 = l2;
  2662. /* Loop to check both convertable and unconvertable parts */
  2663. while (1) {
  2664. if (!origl1 && !origl2)
  2665. return 0;
  2666. if (!origl1)
  2667. return -1;
  2668. if (!origl2)
  2669. return 1;
  2670. /* Loop to get consistent parts of the wto strings, in case
  2671. a conversion fails. */
  2672. got_more = 0;
  2673. l1 = origl1;
  2674. l2 = origl2;
  2675. while (1) {
  2676. c1 = do_convert((iconv_t)-1, MZ_UCS4_NAME, NULL, 1,
  2677. s1, d1 * 4, 4 * l1,
  2678. buf1, 0, MZ_SC_BUF_SIZE - 1,
  2679. 1 /* grow */, 0, 1 /* terminator size */,
  2680. &used1, &clen1,
  2681. &status);
  2682. c2 = do_convert((iconv_t)-1, MZ_UCS4_NAME, NULL, 1,
  2683. s2, d2 * 4, 4 * l2,
  2684. buf2, 0, MZ_SC_BUF_SIZE - 1,
  2685. 1 /* grow */, 0, 1 /* terminator size */,
  2686. &used2, &clen2,
  2687. &status);
  2688. if ((used1 < 4 * l1) || (used2 < 4 * l2)) {
  2689. if (got_more) {
  2690. /* Something went wrong. We've already tried to
  2691. even out the parts that work. Let's give up
  2692. on the first characters */
  2693. clen1 = clen2 = 0;
  2694. break;
  2695. } else if (used1 == used2) {
  2696. /* Not everything, but both ended at the same point */
  2697. break;
  2698. } else {
  2699. /* Pick the smallest */
  2700. if (used2 < used1) {
  2701. used1 = used2;
  2702. got_more = 1;
  2703. } else
  2704. got_more = 2;
  2705. l2 = (used1 >> 2);
  2706. l1 = (used1 >> 2);
  2707. if (!l1) {
  2708. /* Nothing to get this time. */
  2709. clen1 = clen2 = 0;
  2710. c1 = c2 = "";
  2711. used1 = used2 = 0;
  2712. break;
  2713. }
  2714. }
  2715. } else
  2716. /* Got all that we wanted */
  2717. break;
  2718. }
  2719. if (cvt_case) {
  2720. if (clen1)
  2721. c1 = locale_recase(0, c1, 0, clen1,
  2722. case_buf1, 0, MZ_SC_BUF_SIZE - 1,
  2723. &clen1);
  2724. else
  2725. c1 = NULL;
  2726. if (clen2)
  2727. c2 = locale_recase(0, c2, 0, clen2,
  2728. case_buf2, 0, MZ_SC_BUF_SIZE - 1,
  2729. &clen2);
  2730. else
  2731. c2 = NULL;
  2732. /* There shouldn't have been conversion errors, but just in
  2733. case, care of NULL. */
  2734. if (!c1) c1 = "";
  2735. if (!c2) c2 = "";
  2736. }
  2737. /* Collate, finally. */
  2738. status = strcoll(c1, c2);
  2739. /* If one is bigger than the other, we're done. */
  2740. if (status)
  2741. return status;
  2742. /* Otherwise, is there more to check? */
  2743. origl1 -= (used1 >> 2);
  2744. origl2 -= (used2 >> 2);
  2745. d1 += (used1 >> 2);
  2746. d2 += (used2 >> 2);
  2747. if (!origl1 && !origl2)
  2748. return 0;
  2749. /* There's more. It must be that the next character wasn't
  2750. convertable in one of the encodings. */
  2751. if (got_more)
  2752. return ((got_more == 2) ? 1 : -1);
  2753. if (!origl1)
  2754. return -1;
  2755. /* Compare an unconverable character directly. No case conversions
  2756. if it's outside the locale. */
  2757. if (((unsigned int *)s1)[d1] > ((unsigned int *)s2)[d2])
  2758. return 1;
  2759. else if (((unsigned int *)s1)[d1] < ((unsigned int *)s2)[d2])
  2760. return -1;
  2761. else {
  2762. /* We've skipped one unconvertable char, and they still look the
  2763. same. Now try again. */
  2764. origl1 -= 1;
  2765. origl2 -= 1;
  2766. d1 += 1;
  2767. d2 += 1;
  2768. }
  2769. }
  2770. }
  2771. #ifdef MACOS_UNICODE_SUPPORT
  2772. int mz_native_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
  2773. /* The s1 and s2 arguments are actually UTF-16. */
  2774. {
  2775. CFStringRef str1, str2;
  2776. CFComparisonResult r;
  2777. str1 = CFStringCreateWithBytes(NULL, (unsigned char *)s1 XFORM_OK_PLUS (d1 * 2), (l1 * 2),
  2778. kCFStringEncodingUnicode, FALSE);
  2779. str2 = CFStringCreateWithBytes(NULL, (unsigned char *)s2 XFORM_OK_PLUS (d2 * 2), (l2 * 2),
  2780. kCFStringEncodingUnicode, FALSE);
  2781. r = CFStringCompare(str1, str2, (kCFCompareLocalized
  2782. | (cvt_case ? kCFCompareCaseInsensitive : 0)));
  2783. CFRelease(str1);
  2784. CFRelease(str2);
  2785. return (int)r;
  2786. }
  2787. #endif
  2788. #ifdef WINDOWS_UNICODE_SUPPORT
  2789. int mz_native_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
  2790. /* The s1 and s2 arguments are actually UTF-16. */
  2791. {
  2792. int r;
  2793. r = CompareStringW(LOCALE_USER_DEFAULT,
  2794. ((cvt_case ? NORM_IGNORECASE : 0)
  2795. | NORM_IGNOREKANATYPE
  2796. | NORM_IGNOREWIDTH),
  2797. (wchar_t *)s1 + d1, l1, (wchar_t *)s2 + d2, l2);
  2798. return r - 2;
  2799. }
  2800. #endif
  2801. typedef int (*strcoll_proc)(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case);
  2802. int do_locale_comp(const char *who, const mzchar *us1, intptr_t ul1, const mzchar *us2, intptr_t ul2, int cvt_case)
  2803. {
  2804. int xl1;
  2805. int v, endres, utf16 = 0;
  2806. GC_CAN_IGNORE strcoll_proc mz_strcoll = mz_locale_strcoll;
  2807. #if defined(MACOS_UNICODE_SUPPORT) || defined(WINDOWS_UNICODE_SUPPORT)
  2808. if (current_locale_name && !*current_locale_name) {
  2809. utf16 = 1;
  2810. mz_strcoll = mz_native_strcoll;
  2811. }
  2812. #endif
  2813. if (utf16) {
  2814. us1 = (mzchar *)scheme_ucs4_to_utf16(us1, 0, ul1, NULL, 0, &ul1, 1);
  2815. us2 = (mzchar *)scheme_ucs4_to_utf16(us2, 0, ul2, NULL, 0, &ul2, 1);
  2816. ((short *)us1)[ul1] = 0;
  2817. ((short *)us2)[ul2] = 0;
  2818. }
  2819. if (ul1 > ul2) {
  2820. ul1 = ul2;
  2821. endres = 1;
  2822. } else {
  2823. if (ul2 > ul1)
  2824. endres = -1;
  2825. else
  2826. endres = 0;
  2827. }
  2828. /* Walk back through the strings looking for nul characters. If we
  2829. find one, compare the part after the null character to update
  2830. endres, then continue. Unfortunately, we do too much work if an
  2831. earlier part of the string (tested later) determines the result,
  2832. but hopefully nul characters are rare. */
  2833. xl1 = 0;
  2834. while (ul1--) {
  2835. if ((utf16 && (!(((short *)us1)[ul1]) || !(((short *)us2)[ul1])))
  2836. || (!utf16 && (!(us1[ul1]) || !(us2[ul1])))) {
  2837. if (utf16) {
  2838. if (((short *)us1)[ul1])
  2839. endres = 1;
  2840. else if (((short *)us2)[ul1])
  2841. endres = -1;
  2842. } else {
  2843. if (us1[ul1])
  2844. endres = 1;
  2845. else if (us2[ul1])
  2846. endres = -1;
  2847. }
  2848. if (xl1)
  2849. v = mz_strcoll((char *)us1, ul1 + 1, xl1, (char *)us2, ul1 + 1, xl1, cvt_case);
  2850. else
  2851. v = 0;
  2852. if (v)
  2853. endres = v;
  2854. xl1 = 0;
  2855. } else {
  2856. xl1++;
  2857. }
  2858. }
  2859. v = mz_strcoll((char *)us1, 0, xl1, (char *)us2, 0, xl1, cvt_case);
  2860. if (v)
  2861. endres = v;
  2862. return endres;
  2863. }
  2864. mzchar *do_locale_recase(int to_up, mzchar *in, int delta, int len, intptr_t *olen)
  2865. {
  2866. Scheme_Object *parts = scheme_null;
  2867. char *c, buf[MZ_SC_BUF_SIZE], case_buf[MZ_SC_BUF_SIZE];
  2868. intptr_t clen, used;
  2869. int status;
  2870. while (len) {
  2871. /* We might have conversion errors... */
  2872. c = do_convert((iconv_t)-1, MZ_UCS4_NAME, NULL, 1,
  2873. (char *)in, 4 * delta, 4 * len,
  2874. buf, 0, MZ_SC_BUF_SIZE - 1,
  2875. 1 /* grow */, 0, 1 /* terminator size */,
  2876. &used, &clen,
  2877. &status);
  2878. used >>= 2;
  2879. delta += used;
  2880. len -= used;
  2881. c = locale_recase(to_up, c, 0, clen,
  2882. case_buf, 0, MZ_SC_BUF_SIZE - 1,
  2883. &clen);
  2884. if (!c)
  2885. clen = 0;
  2886. c = do_convert((iconv_t)-1, NULL, MZ_UCS4_NAME, 2,
  2887. c, 0, clen,
  2888. NULL, 0, 0,
  2889. 1 /* grow */, 0, sizeof(mzchar) /* terminator size */,
  2890. &used, &clen,
  2891. &status);
  2892. if (!len && SCHEME_NULLP(parts)) {
  2893. *olen = (clen >> 2);
  2894. ((mzchar *)c)[*olen] = 0;
  2895. return (mzchar *)c;
  2896. }
  2897. /* We can get here if there was some conversion error at some
  2898. point. We're building up a list of parts. */
  2899. parts = scheme_make_pair(scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0),
  2900. parts);
  2901. if (len) {
  2902. /* Conversion error, so skip one char. */
  2903. parts = scheme_make_pair(scheme_make_sized_offset_char_string(in, delta, 1, 1),
  2904. parts);
  2905. delta += 1;
  2906. len -= 1;
  2907. }
  2908. }
  2909. parts = append_all_strings_backwards(parts);
  2910. *olen = SCHEME_CHAR_STRTAG_VAL(parts);
  2911. return SCHEME_CHAR_STR_VAL(parts);
  2912. }
  2913. #ifdef MACOS_UNICODE_SUPPORT
  2914. mzchar *do_native_recase(int to_up, mzchar *in, int delta, int len, intptr_t *olen)
  2915. /* The in argument is actually UTF-16. */
  2916. {
  2917. CFMutableStringRef mstr;
  2918. CFStringRef str;
  2919. GC_CAN_IGNORE CFRange rng;
  2920. char *result;
  2921. str = CFStringCreateWithBytes(NULL, ((unsigned char *)in) XFORM_OK_PLUS (delta * 2), (len * 2),
  2922. kCFStringEncodingUnicode, FALSE);
  2923. mstr = CFStringCreateMutableCopy(NULL, 0, str);
  2924. CFRelease(str);
  2925. if (to_up)
  2926. CFStringUppercase(mstr, NULL);
  2927. else
  2928. CFStringLowercase(mstr, NULL);
  2929. len = CFStringGetLength(mstr);
  2930. *olen = len;
  2931. result = (char *)scheme_malloc_atomic((len + 1) * 2);
  2932. rng = CFRangeMake(0, len);
  2933. CFStringGetCharacters(mstr, rng, (UniChar *)result);
  2934. CFRelease(mstr);
  2935. ((UniChar *)result)[len] = 0;
  2936. return (mzchar *)result;
  2937. }
  2938. #endif
  2939. #ifdef WINDOWS_UNICODE_SUPPORT
  2940. mzchar *do_native_recase(int to_up, mzchar *in, int delta, int len, intptr_t *olen)
  2941. /* The in argument is actually UTF-16. */
  2942. {
  2943. char *result;
  2944. result = (char *)scheme_malloc_atomic((len + 1) * 2);
  2945. memcpy(result, ((char *)in) + (2 * delta), len * 2);
  2946. ((wchar_t*)result)[len] = 0;
  2947. if (to_up)
  2948. CharUpperBuffW((wchar_t *)result, len);
  2949. else
  2950. CharLowerBuffW((wchar_t *)result, len);
  2951. *olen = len;
  2952. return (mzchar *)result;
  2953. }
  2954. #endif
  2955. typedef mzchar *(*recase_proc)(int to_up, mzchar *in, int delta, int len, intptr_t *olen);
  2956. static Scheme_Object *mz_recase(const char *who, int to_up, mzchar *us, intptr_t ulen)
  2957. {
  2958. intptr_t ulen1;
  2959. int utf16 = 0, i, delta = 0;
  2960. mzchar *us1;
  2961. recase_proc mz_do_recase = do_locale_recase;
  2962. Scheme_Object *s, *parts = scheme_null;
  2963. reset_locale();
  2964. #if defined(MACOS_UNICODE_SUPPORT) || defined(WINDOWS_UNICODE_SUPPORT)
  2965. if (current_locale_name && !*current_locale_name) {
  2966. utf16 = 1;
  2967. mz_do_recase = do_native_recase;
  2968. }
  2969. #endif
  2970. if (utf16) {
  2971. us = (mzchar *)scheme_ucs4_to_utf16(us, 0, ulen, NULL, 0, &ulen, 1);
  2972. ((short *)us)[ulen] = 0;
  2973. }
  2974. /* If there are nulls in the string, then we have to make multiple
  2975. calls to mz_do_recase */
  2976. i = 0;
  2977. while (1) {
  2978. for (; i < ulen; i++) {
  2979. if (utf16) {
  2980. if (!((short *)us)[i])
  2981. break;
  2982. } else if (!us[i])
  2983. break;
  2984. }
  2985. us1 = mz_do_recase(to_up, us, delta, i - delta, &ulen1);
  2986. if (utf16) {
  2987. us1 = scheme_utf16_to_ucs4((unsigned short *)us1, 0, ulen1, NULL, 0, &ulen1, 1);
  2988. us1[ulen1] = 0;
  2989. }
  2990. s = scheme_make_sized_char_string((mzchar *)us1, ulen1, 0);
  2991. if (SCHEME_NULLP(parts) && (i == ulen))
  2992. return s;
  2993. parts = scheme_make_pair(s, parts);
  2994. if (i == ulen)
  2995. break;
  2996. /* upcasing and encoding a nul char is easy: */
  2997. s = scheme_make_sized_char_string((mzchar *)"\0\0\0\0", 1, 0);
  2998. parts = scheme_make_pair(s, parts);
  2999. i++;
  3000. delta = i;
  3001. if (i == ulen)
  3002. break;
  3003. }
  3004. return append_all_strings_backwards(parts);
  3005. }
  3006. #endif
  3007. static Scheme_Object *
  3008. unicode_recase(const char *who, int to_up, int argc, Scheme_Object *argv[])
  3009. {
  3010. intptr_t len;
  3011. mzchar *chars;
  3012. if (!SCHEME_CHAR_STRINGP(argv[0]))
  3013. scheme_wrong_type(who, "string", 0, argc, argv);
  3014. chars = SCHEME_CHAR_STR_VAL(argv[0]);
  3015. len = SCHEME_CHAR_STRTAG_VAL(argv[0]);
  3016. return mz_recase(who, to_up, chars, len);
  3017. }
  3018. static Scheme_Object *
  3019. string_locale_upcase(int argc, Scheme_Object *argv[])
  3020. {
  3021. return unicode_recase("string-locale-upcase", 1, argc, argv);
  3022. }
  3023. static Scheme_Object *
  3024. string_locale_downcase(int argc, Scheme_Object *argv[])
  3025. {
  3026. return unicode_recase("string-locale-downcase", 0, argc, argv);
  3027. }
  3028. static void reset_locale(void)
  3029. {
  3030. Scheme_Object *v;
  3031. const mzchar *name;
  3032. /* This function needs to work before threads are set up: */
  3033. if (scheme_current_thread) {
  3034. v = scheme_get_param(scheme_current_config(), MZCONFIG_LOCALE);
  3035. } else {
  3036. v = scheme_make_immutable_sized_utf8_string("", 0);
  3037. }
  3038. locale_on = SCHEME_TRUEP(v);
  3039. if (locale_on) {
  3040. name = SCHEME_CHAR_STR_VAL(v);
  3041. #ifndef DONT_USE_LOCALE
  3042. if ((current_locale_name != name)
  3043. && (!current_locale_name
  3044. || mz_char_strcmp("result-locale",
  3045. current_locale_name, scheme_char_strlen(current_locale_name),
  3046. name, SCHEME_CHAR_STRLEN_VAL(v),
  3047. 0, 1))) {
  3048. /* We only need CTYPE and COLLATE; two calls seem to be much
  3049. faster than one call with ALL */
  3050. char *n, buf[32];
  3051. n = scheme_utf8_encode_to_buffer(name, SCHEME_CHAR_STRLEN_VAL(v), buf, 32);
  3052. if (!setlocale(LC_CTYPE, n))
  3053. setlocale(LC_CTYPE, "C");
  3054. if (!setlocale(LC_COLLATE, n))
  3055. setlocale(LC_COLLATE, "C");
  3056. }
  3057. #endif
  3058. current_locale_name_ptr = (void *)name;
  3059. }
  3060. }
  3061. char *scheme_push_c_numeric_locale()
  3062. {
  3063. #ifndef DONT_USE_LOCALE
  3064. GC_CAN_IGNORE char *prev;
  3065. prev = setlocale(LC_NUMERIC, NULL);
  3066. if (!strcmp(prev, "C"))
  3067. return NULL;
  3068. else
  3069. return setlocale(LC_NUMERIC, "C");
  3070. #endif
  3071. }
  3072. void scheme_pop_c_numeric_locale(char *prev)
  3073. {
  3074. #ifndef DONT_USE_LOCALE
  3075. if (prev)
  3076. setlocale(LC_NUMERIC, prev);
  3077. #endif
  3078. }
  3079. static int find_special_casing(int ch)
  3080. {
  3081. /* Binary search */
  3082. int i, lo, hi, j;
  3083. i = NUM_SPECIAL_CASINGS >> 1;
  3084. lo = i;
  3085. hi = NUM_SPECIAL_CASINGS - i - 1;
  3086. while (1) {
  3087. if (uchar_special_casings[i * 10] == ch)
  3088. return i * 10;
  3089. if (uchar_special_casings[i * 10] > ch) {
  3090. j = i - lo;
  3091. i = j + (lo >> 1);
  3092. hi = lo - (i - j) - 1;
  3093. lo = i - j;
  3094. } else {
  3095. j = i + 1;
  3096. i = j + (hi >> 1);
  3097. lo = i - j;
  3098. hi = hi - (i - j) - 1;
  3099. }
  3100. }
  3101. }
  3102. static int is_final_sigma(int mode, mzchar *s, int d, int i, int len)
  3103. {
  3104. int j;
  3105. if (mode == 3)
  3106. return 1;
  3107. /* find a cased char before, skipping case-ignorable: */
  3108. for (j = i - 1; j >= d; j--) {
  3109. if (!scheme_iscaseignorable(s[j])) {
  3110. if (scheme_iscased(s[j]))
  3111. break;
  3112. else
  3113. return 0;
  3114. }
  3115. }
  3116. if (j < d)
  3117. return 0;
  3118. /* next non-case-ignorable must not be cased: */
  3119. for (j = i + 1; j < d + len; j++) {
  3120. if (!scheme_iscaseignorable(s[j])) {
  3121. return !scheme_iscased(s[j]);
  3122. }
  3123. }
  3124. return 1;
  3125. }
  3126. mzchar *scheme_string_recase(mzchar *s, int d, int len, int mode, int inplace, int *_len)
  3127. {
  3128. mzchar *t;
  3129. int i, extra = 0, pos, special = 0, td, prev_was_cased = 0, xmode = mode;
  3130. for (i = 0; i < len; i++) {
  3131. if (scheme_isspecialcasing(s[d+i])) {
  3132. pos = find_special_casing(s[d+i]);
  3133. if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
  3134. special = 1;
  3135. extra += (uchar_special_casings[pos + 1 + (xmode << 1)] - 1);
  3136. }
  3137. }
  3138. if (mode == 2) {
  3139. if (!scheme_iscaseignorable(s[d+i]))
  3140. prev_was_cased = scheme_iscased(s[d+i]);
  3141. xmode = (prev_was_cased ? 0 : 2);
  3142. }
  3143. }
  3144. if (_len)
  3145. *_len = len + extra;
  3146. if (!extra && inplace) {
  3147. t = s;
  3148. td = d;
  3149. } else {
  3150. t = scheme_malloc_atomic(sizeof(mzchar) * (len + extra + 1));
  3151. td = 0;
  3152. }
  3153. if (!special) {
  3154. if (mode == 0) {
  3155. for (i = 0; i < len; i++) {
  3156. t[i+td] = scheme_tolower(s[i+d]);
  3157. }
  3158. } else if (mode == 1) {
  3159. for (i = 0; i < len; i++) {
  3160. t[i+td] = scheme_toupper(s[i+d]);
  3161. }
  3162. } else if (mode == 2) {
  3163. prev_was_cased = 0;
  3164. for (i = 0; i < len; i++) {
  3165. if (!prev_was_cased)
  3166. t[i+td] = scheme_totitle(s[i+d]);
  3167. else
  3168. t[i+td] = scheme_tolower(s[i+d]);
  3169. if (!scheme_iscaseignorable(s[i+d]))
  3170. prev_was_cased = scheme_iscased(s[i+d]);
  3171. }
  3172. } else /* if (mode == 3) */ {
  3173. for (i = 0; i < len; i++) {
  3174. t[i+td] = scheme_tofold(s[i+d]);
  3175. }
  3176. }
  3177. } else {
  3178. int j = 0, c;
  3179. prev_was_cased = 0;
  3180. for (i = 0; i < len; i++) {
  3181. if (mode == 0) {
  3182. t[j+td] = scheme_tolower(s[i+d]);
  3183. } else if (mode == 1) {
  3184. t[j+td] = scheme_toupper(s[i+d]);
  3185. } else if (mode == 2) {
  3186. if (!prev_was_cased) {
  3187. xmode = 2;
  3188. t[j+td] = scheme_totitle(s[i+d]);
  3189. } else {
  3190. xmode = 0;
  3191. t[j+td] = scheme_tolower(s[i+d]);
  3192. }
  3193. if (!scheme_iscaseignorable(s[i+d]))
  3194. prev_was_cased = scheme_iscased(s[i+d]);
  3195. } else /* if (mode == 3) */ {
  3196. t[j+td] = scheme_tofold(s[i+d]);
  3197. }
  3198. if (scheme_isspecialcasing(s[i+d])) {
  3199. pos = find_special_casing(s[i+d]);
  3200. if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
  3201. c = uchar_special_casings[pos + 1 + (xmode << 1)];
  3202. pos = uchar_special_casings[pos + 2 + (xmode << 1)];
  3203. while (c--) {
  3204. t[(j++)+td] = uchar_special_casing_data[pos++];
  3205. }
  3206. } else
  3207. j++;
  3208. } else
  3209. j++;
  3210. }
  3211. }
  3212. t[len+extra+td] = 0;
  3213. return t;
  3214. }
  3215. static Scheme_Object *string_recase (const char *name, int argc, Scheme_Object *argv[], int mode)
  3216. {
  3217. mzchar *s;
  3218. int len;
  3219. if (!SCHEME_CHAR_STRINGP(argv[0]))
  3220. scheme_wrong_type(name, "string", 0, argc, argv);
  3221. s = SCHEME_CHAR_STR_VAL(argv[0]);
  3222. len = SCHEME_CHAR_STRLEN_VAL(argv[0]);
  3223. s = scheme_string_recase(s, 0, len, mode, 0, &len);
  3224. return scheme_make_sized_char_string(s, len, 0);
  3225. }
  3226. static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[])
  3227. {
  3228. return string_recase("string-upcase", argc, argv, 1);
  3229. }
  3230. static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[])
  3231. {
  3232. return string_recase("string-downcase", argc, argv, 0);
  3233. }
  3234. static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[])
  3235. {
  3236. return string_recase("string-titlecase", argc, argv, 2);
  3237. }
  3238. static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[])
  3239. {
  3240. return string_recase("string-foldcase", argc, argv, 3);
  3241. }
  3242. /**********************************************************************/
  3243. /* normalization */
  3244. /**********************************************************************/
  3245. #define MZ_JAMO_INITIAL_CONSONANT_START 0x1100
  3246. #define MZ_JAMO_INITIAL_CONSONANT_COUNT 19
  3247. #define MZ_JAMO_INITIAL_CONSONANT_END (MZ_JAMO_INITIAL_CONSONANT_START + MZ_JAMO_INITIAL_CONSONANT_COUNT - 1)
  3248. #define MZ_JAMO_VOWEL_START 0x1161
  3249. #define MZ_JAMO_VOWEL_COUNT 21
  3250. #define MZ_JAMO_VOWEL_END (MZ_JAMO_VOWEL_START + MZ_JAMO_VOWEL_COUNT - 1)
  3251. /* First in this range is not actually a consonant, but a placeholder for "no consonant" */
  3252. #define MZ_JAMO_TRAILING_CONSONANT_START 0x11A7
  3253. #define MZ_JAMO_TRAILING_CONSONANT_COUNT 28
  3254. #define MZ_JAMO_TRAILING_CONSONANT_END (MZ_JAMO_TRAILING_CONSONANT_START + MZ_JAMO_TRAILING_CONSONANT_COUNT - 1)
  3255. #define MZ_JAMO_SYLLABLE_START 0xAC00
  3256. #define MZ_JAMO_SYLLABLE_END (MZ_JAMO_SYLLABLE_START + 11171)
  3257. static mzchar get_composition(mzchar a, mzchar b)
  3258. {
  3259. uintptr_t key = (a << 16) | b;
  3260. int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
  3261. int below_len = pos;
  3262. int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
  3263. if (a > 0xFFFF) return 0;
  3264. /* Binary search: */
  3265. while (key != utable_compose_pairs[pos]) {
  3266. if (key > utable_compose_pairs[pos]) {
  3267. if (!above_len)
  3268. return 0;
  3269. new_pos = pos + (above_len >> 1) + 1;
  3270. below_len = (new_pos - pos - 1);
  3271. above_len = (above_len - below_len - 1);
  3272. pos = new_pos;
  3273. } else if (key < utable_compose_pairs[pos]) {
  3274. if (!below_len)
  3275. return 0;
  3276. new_pos = pos - ((below_len >> 1) + 1);
  3277. above_len = (pos - new_pos - 1);
  3278. below_len = (below_len - above_len - 1);
  3279. pos = new_pos;
  3280. }
  3281. }
  3282. return utable_compose_result[pos];
  3283. }
  3284. mzchar get_canon_decomposition(mzchar key, mzchar *b)
  3285. {
  3286. int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
  3287. int below_len = pos;
  3288. int above_len = (DECOMPOSE_TABLE_SIZE - pos - 1);
  3289. /* Binary search: */
  3290. while (key != utable_decomp_keys[pos]) {
  3291. if (key > utable_decomp_keys[pos]) {
  3292. if (!above_len)
  3293. return 0;
  3294. new_pos = pos + (above_len >> 1) + 1;
  3295. below_len = (new_pos - pos - 1);
  3296. above_len = (above_len - below_len - 1);
  3297. pos = new_pos;
  3298. } else if (key < utable_decomp_keys[pos]) {
  3299. if (!below_len)
  3300. return 0;
  3301. new_pos = pos - ((below_len >> 1) + 1);
  3302. above_len = (pos - new_pos - 1);
  3303. below_len = (below_len - above_len - 1);
  3304. pos = new_pos;
  3305. }
  3306. }
  3307. pos = utable_decomp_indices[pos];
  3308. if (pos < 0) {
  3309. pos = -(pos + 1);
  3310. pos <<= 1;
  3311. *b = utable_compose_long_pairs[pos + 1];
  3312. return utable_compose_long_pairs[pos];
  3313. } else {
  3314. key = utable_compose_pairs[pos];
  3315. *b = (key & 0xFFFF);
  3316. return (key >> 16);
  3317. }
  3318. }
  3319. int get_kompat_decomposition(mzchar key, unsigned short **chars)
  3320. {
  3321. int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
  3322. int below_len = pos;
  3323. int above_len = (KOMPAT_DECOMPOSE_TABLE_SIZE - pos - 1);
  3324. /* Binary search: */
  3325. while (key != utable_kompat_decomp_keys[pos]) {
  3326. if (key > utable_kompat_decomp_keys[pos]) {
  3327. if (!above_len)
  3328. return 0;
  3329. new_pos = pos + (above_len >> 1) + 1;
  3330. below_len = (new_pos - pos - 1);
  3331. above_len = (above_len - below_len - 1);
  3332. pos = new_pos;
  3333. } else if (key < utable_kompat_decomp_keys[pos]) {
  3334. if (!below_len)
  3335. return 0;
  3336. new_pos = pos - ((below_len >> 1) + 1);
  3337. above_len = (pos - new_pos - 1);
  3338. below_len = (below_len - above_len - 1);
  3339. pos = new_pos;
  3340. }
  3341. }
  3342. *chars = utable_kompat_decomp_strs XFORM_OK_PLUS utable_kompat_decomp_indices[pos];
  3343. return utable_kompat_decomp_lens[pos];
  3344. }
  3345. static Scheme_Object *normalize_c(Scheme_Object *o)
  3346. /* Assumes then given string is in normal form D */
  3347. {
  3348. mzchar *s, *s2, tmp, last_c0 = 0;
  3349. int len, i, j = 0, last_c0_pos = 0, last_cc = 0;
  3350. s = SCHEME_CHAR_STR_VAL(o);
  3351. len = SCHEME_CHAR_STRLEN_VAL(o);
  3352. s2 = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
  3353. memcpy(s2, s, len * sizeof(mzchar));
  3354. for (i = 0; i < len; i++) {
  3355. if ((i + 1 < len)
  3356. && (s2[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
  3357. && (s2[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
  3358. && (s2[i+1] >= MZ_JAMO_VOWEL_START)
  3359. && (s2[i+1] <= MZ_JAMO_VOWEL_END)) {
  3360. /* Need Hangul composition */
  3361. if ((i + 2 < len)
  3362. && (s2[i+2] > MZ_JAMO_TRAILING_CONSONANT_START)
  3363. && (s2[i+2] <= MZ_JAMO_TRAILING_CONSONANT_END)) {
  3364. /* 3-char composition */
  3365. tmp = (MZ_JAMO_SYLLABLE_START
  3366. + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START)
  3367. * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
  3368. + ((s2[i+1] - MZ_JAMO_VOWEL_START)
  3369. * MZ_JAMO_TRAILING_CONSONANT_COUNT)
  3370. + (s2[i+2] - MZ_JAMO_TRAILING_CONSONANT_START));
  3371. i += 2;
  3372. } else {
  3373. /* 2-char composition */
  3374. tmp = (MZ_JAMO_SYLLABLE_START
  3375. + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START)
  3376. * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
  3377. + ((s2[i+1] - MZ_JAMO_VOWEL_START)
  3378. * MZ_JAMO_TRAILING_CONSONANT_COUNT));
  3379. i++;
  3380. }
  3381. last_c0 = tmp;
  3382. last_c0_pos = j;
  3383. last_cc = 0;
  3384. s2[j++] = tmp;
  3385. } else {
  3386. int cc;
  3387. cc = scheme_combining_class(s2[i]);
  3388. if (last_c0 && (cc > last_cc))
  3389. tmp = get_composition(last_c0, s2[i]);
  3390. else
  3391. tmp = 0;
  3392. if (tmp) {
  3393. /* Need to compose */
  3394. s2[last_c0_pos] = tmp;
  3395. last_c0 = tmp;
  3396. } else if (!cc) {
  3397. /* Reset last_c0... */
  3398. tmp = s2[i];
  3399. if (scheme_needs_maybe_compose(tmp)) {
  3400. last_c0 = tmp;
  3401. last_c0_pos = j;
  3402. } else {
  3403. last_c0 = 0;
  3404. }
  3405. last_cc = -1;
  3406. s2[j++] = tmp;
  3407. } else {
  3408. s2[j++] = s2[i];
  3409. last_cc = cc;
  3410. }
  3411. }
  3412. }
  3413. s2[j] = 0;
  3414. if (len - j > 16) {
  3415. s2 = (mzchar *)scheme_malloc_atomic((j + 1) * sizeof(mzchar));
  3416. memcpy(s2, s, (j + 1) * sizeof(mzchar));
  3417. s2 = s;
  3418. }
  3419. return scheme_make_sized_char_string(s2, j, 0);
  3420. }
  3421. static Scheme_Object *normalize_d(Scheme_Object *o, int kompat)
  3422. {
  3423. mzchar *s, tmp, *s2;
  3424. int len, i, delta, j, swapped;
  3425. s = SCHEME_CHAR_STR_VAL(o);
  3426. len = SCHEME_CHAR_STRLEN_VAL(o);
  3427. /* Run through string list to predict expansion: */
  3428. delta = 0;
  3429. for (i = 0; i < len; i++) {
  3430. if (scheme_needs_decompose(s[i])) {
  3431. int klen;
  3432. mzchar snd;
  3433. GC_CAN_IGNORE unsigned short *start;
  3434. tmp = s[i];
  3435. while (scheme_needs_decompose(tmp)) {
  3436. if (kompat)
  3437. klen = get_kompat_decomposition(tmp, &start);
  3438. else
  3439. klen = 0;
  3440. if (klen) {
  3441. delta += (klen - 1);
  3442. break;
  3443. } else {
  3444. tmp = get_canon_decomposition(tmp, &snd);
  3445. if (tmp) {
  3446. if (snd) {
  3447. delta++;
  3448. if (kompat) {
  3449. klen = get_kompat_decomposition(snd, &start);
  3450. if (klen)
  3451. delta += (klen - 1);
  3452. }
  3453. }
  3454. } else
  3455. break;
  3456. }
  3457. }
  3458. } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
  3459. && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
  3460. tmp = s[i];
  3461. tmp -= MZ_JAMO_SYLLABLE_START;
  3462. if (tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT)
  3463. delta += 2;
  3464. else
  3465. delta += 1;
  3466. }
  3467. }
  3468. s2 = (mzchar *)scheme_malloc_atomic((len + delta + 1) * sizeof(mzchar));
  3469. j = 0;
  3470. for (i = 0; i < len; i++) {
  3471. if (scheme_needs_decompose(s[i])) {
  3472. mzchar snd, tmp2;
  3473. int snds = 0, klen = 0, k;
  3474. GC_CAN_IGNORE unsigned short*start;
  3475. tmp = s[i];
  3476. while (scheme_needs_decompose(tmp)) {
  3477. if (kompat)
  3478. klen = get_kompat_decomposition(tmp, &start);
  3479. else
  3480. klen = 0;
  3481. if (klen) {
  3482. for (k = 0; k < klen; k++) {
  3483. s2[j++] = start[k];
  3484. }
  3485. break;
  3486. } else {
  3487. tmp2 = get_canon_decomposition(tmp, &snd);
  3488. if (tmp2) {
  3489. tmp = tmp2;
  3490. if (snd) {
  3491. if (kompat)
  3492. klen = get_kompat_decomposition(snd, &start);
  3493. else
  3494. klen = 0;
  3495. if (klen) {
  3496. snds += klen;
  3497. for (k = 0; k < klen; k++) {
  3498. s2[len + delta - snds + k] = start[k];
  3499. }
  3500. klen = 0;
  3501. } else {
  3502. snds++;
  3503. s2[len + delta - snds] = snd;
  3504. }
  3505. }
  3506. } else
  3507. break;
  3508. }
  3509. }
  3510. if (!klen)
  3511. s2[j++] = tmp;
  3512. memcpy(s2 + j, s2 + len + delta - snds, snds * sizeof(mzchar));
  3513. j += snds;
  3514. } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
  3515. && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
  3516. int l, v, t;
  3517. tmp = s[i];
  3518. tmp -= MZ_JAMO_SYLLABLE_START;
  3519. l = tmp / (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT);
  3520. v = (tmp % (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)) / MZ_JAMO_TRAILING_CONSONANT_COUNT;
  3521. t = tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT;
  3522. s2[j++] = MZ_JAMO_INITIAL_CONSONANT_START + l;
  3523. s2[j++] = MZ_JAMO_VOWEL_START + v;
  3524. if (t) {
  3525. s2[j++] = MZ_JAMO_TRAILING_CONSONANT_START + t;
  3526. }
  3527. } else {
  3528. s2[j++] = s[i];
  3529. }
  3530. }
  3531. s2[j] = 0;
  3532. len += delta;
  3533. /* Reorder pass: */
  3534. do {
  3535. swapped = 0;
  3536. for (i = 0; i < len; i++) {
  3537. if ((i + 1 < len)
  3538. && scheme_combining_class(s2[i])
  3539. && scheme_combining_class(s2[i+1])
  3540. && (scheme_combining_class(s2[i+1]) < scheme_combining_class(s2[i]))) {
  3541. /* Reorder and try again: */
  3542. tmp = s2[i + 1];
  3543. s2[i + 1] = s2[i];
  3544. s2[i] = tmp;
  3545. i--;
  3546. swapped = 1;
  3547. }
  3548. }
  3549. } while (swapped);
  3550. return scheme_make_sized_char_string(s2, len, 0);
  3551. }
  3552. static Scheme_Object *do_string_normalize_c (const char *who, int argc, Scheme_Object *argv[], int kompat)
  3553. {
  3554. Scheme_Object *o;
  3555. mzchar *s, last_c0 = 0, snd;
  3556. int len, i, last_cc = 0;
  3557. o = argv[0];
  3558. if (!SCHEME_CHAR_STRINGP(o))
  3559. scheme_wrong_type(who, "string", 0, argc, argv);
  3560. s = SCHEME_CHAR_STR_VAL(o);
  3561. len = SCHEME_CHAR_STRLEN_VAL(o);
  3562. for (i = 0; i < len; i++) {
  3563. if (scheme_needs_decompose(s[i])
  3564. && (kompat || get_canon_decomposition(s[i], &snd))) {
  3565. /* Decomposition may expose a different composition */
  3566. break;
  3567. } else if ((i + 1 < len)
  3568. && scheme_combining_class(s[i])
  3569. && scheme_combining_class(s[i+1])
  3570. && (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
  3571. /* Need to reorder */
  3572. break;
  3573. } else if ((s[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
  3574. && (s[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
  3575. && (s[i+1] >= MZ_JAMO_VOWEL_START)
  3576. && (s[i+1] <= MZ_JAMO_VOWEL_END)) {
  3577. /* Need Hangul composition */
  3578. break;
  3579. } else if (last_c0
  3580. && get_composition(last_c0, s[i])
  3581. && (scheme_combining_class(s[i]) > last_cc)) {
  3582. /* Need to compose */
  3583. break;
  3584. } else {
  3585. int cc;
  3586. cc = scheme_combining_class(s[i]);
  3587. if (!cc) {
  3588. if (scheme_needs_maybe_compose(s[i]))
  3589. last_c0 = s[i];
  3590. else
  3591. last_c0 = 0;
  3592. last_cc = -1;
  3593. } else
  3594. last_cc = cc;
  3595. }
  3596. }
  3597. if (i < len) {
  3598. o = normalize_c(normalize_d(o, kompat));
  3599. }
  3600. return o;
  3601. }
  3602. static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[])
  3603. {
  3604. return do_string_normalize_c("string-normalize-nfc", argc, argv, 0);
  3605. }
  3606. static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[])
  3607. {
  3608. return do_string_normalize_c("string-normalize-nfkc", argc, argv, 1);
  3609. }
  3610. static Scheme_Object *do_string_normalize_d (const char *who, int argc, Scheme_Object *argv[], int kompat)
  3611. {
  3612. Scheme_Object *o;
  3613. mzchar *s;
  3614. int len, i;
  3615. o = argv[0];
  3616. if (!SCHEME_CHAR_STRINGP(o))
  3617. scheme_wrong_type(who, "string", 0, argc, argv);
  3618. s = SCHEME_CHAR_STR_VAL(o);
  3619. len = SCHEME_CHAR_STRLEN_VAL(o);
  3620. for (i = len; i--; ) {
  3621. if (scheme_needs_decompose(s[i])) {
  3622. /* Need to decompose */
  3623. mzchar snd;
  3624. if (kompat || get_canon_decomposition(s[i], &snd))
  3625. break;
  3626. } else if ((i + 1 < len)
  3627. && scheme_combining_class(s[i])
  3628. && scheme_combining_class(s[i+1])
  3629. && (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
  3630. /* Need to reorder */
  3631. break;
  3632. } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
  3633. && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
  3634. /* Need Hangul decomposition */
  3635. break;
  3636. }
  3637. }
  3638. if (i >= 0) {
  3639. o = normalize_d(o, kompat);
  3640. }
  3641. return o;
  3642. }
  3643. static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[])
  3644. {
  3645. return do_string_normalize_d("string-normalize-nfd", argc, argv, 0);
  3646. }
  3647. static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[])
  3648. {
  3649. return do_string_normalize_d("string-normalize-nfkd", argc, argv, 1);
  3650. }
  3651. /**********************************************************************/
  3652. /* strcmps */
  3653. /**********************************************************************/
  3654. intptr_t scheme_char_strlen(const mzchar *s)
  3655. {
  3656. intptr_t i;
  3657. for (i = 0; s[i]; i++) {
  3658. }
  3659. return i;
  3660. }
  3661. static int mz_char_strcmp(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2,
  3662. int use_locale, int size_shortcut)
  3663. {
  3664. intptr_t endres;
  3665. if (size_shortcut && (l1 != l2))
  3666. return 1;
  3667. #ifndef DONT_USE_LOCALE
  3668. if (use_locale) {
  3669. reset_locale();
  3670. if (locale_on) {
  3671. return do_locale_comp(who, str1, l1, str2, l2, 0);
  3672. }
  3673. }
  3674. #endif
  3675. if (l1 > l2) {
  3676. l1 = l2;
  3677. endres = 1;
  3678. } else {
  3679. if (l2 > l1)
  3680. endres = -1;
  3681. else
  3682. endres = 0;
  3683. }
  3684. while (l1--) {
  3685. unsigned int a, b;
  3686. a = *(str1++);
  3687. b = *(str2++);
  3688. a = a - b;
  3689. if (a)
  3690. return a;
  3691. }
  3692. return endres;
  3693. }
  3694. static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2,
  3695. int use_locale, int size_shortcut)
  3696. {
  3697. intptr_t p1, p2, sp1, sp2, a, b;
  3698. mzchar spec1[SPECIAL_CASE_FOLD_MAX], spec2[SPECIAL_CASE_FOLD_MAX];
  3699. if (size_shortcut && (l1 != l2))
  3700. return 1;
  3701. #ifndef DONT_USE_LOCALE
  3702. if (use_locale) {
  3703. reset_locale();
  3704. if (locale_on) {
  3705. return do_locale_comp(who, str1, l1, str2, l2, 1);
  3706. }
  3707. }
  3708. #endif
  3709. p1 = sp1 = 0;
  3710. p2 = sp2 = 0;
  3711. while (((p1 < l1) || sp1) && ((p2 < l2) || sp2)) {
  3712. if (sp1) {
  3713. a = spec1[--sp1];
  3714. } else {
  3715. a = str1[p1];
  3716. if (scheme_isspecialcasing(a)) {
  3717. int pos, i;
  3718. pos = find_special_casing(a);
  3719. sp1 = uchar_special_casings[pos + 7];
  3720. pos = uchar_special_casings[pos + 8];
  3721. for (i = sp1; i--; pos++) {
  3722. spec1[i] = uchar_special_casing_data[pos];
  3723. }
  3724. a = spec1[--sp1];
  3725. } else {
  3726. a = scheme_tofold(a);
  3727. }
  3728. p1++;
  3729. }
  3730. if (sp2) {
  3731. b = spec2[--sp2];
  3732. } else {
  3733. b = str2[p2];
  3734. if (scheme_isspecialcasing(b)) {
  3735. int pos, i;
  3736. pos = find_special_casing(b);
  3737. sp2 = uchar_special_casings[pos + 7];
  3738. pos = uchar_special_casings[pos + 8];
  3739. for (i = sp2; i--; pos++) {
  3740. spec2[i] = uchar_special_casing_data[pos];
  3741. }
  3742. b = spec2[--sp2];
  3743. } else {
  3744. b = scheme_tofold(b);
  3745. }
  3746. p2++;
  3747. }
  3748. a = a - b;
  3749. if (a)
  3750. return a;
  3751. }
  3752. return ((p1 < l1) || sp1) - ((p2 < l2) || sp2);
  3753. }
  3754. static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2)
  3755. {
  3756. intptr_t endres;
  3757. if (l1 > l2) {
  3758. l1 = l2;
  3759. endres = 1;
  3760. } else {
  3761. if (l2 > l1)
  3762. endres = -1;
  3763. else
  3764. endres = 0;
  3765. }
  3766. while (l1--) {
  3767. unsigned int a, b;
  3768. a = *(str1++);
  3769. b = *(str2++);
  3770. a = a - b;
  3771. if (a)
  3772. return a;
  3773. }
  3774. return endres;
  3775. }
  3776. /**********************************************************************/
  3777. /* byte string conversion */
  3778. /**********************************************************************/
  3779. static void close_converter(Scheme_Object *o, void *data)
  3780. {
  3781. Scheme_Converter *c = (Scheme_Converter *)o;
  3782. if (!c->closed) {
  3783. c->closed = 1;
  3784. if (c->kind == mzICONV_KIND) {
  3785. iconv_close(c->cd);
  3786. c->cd = (iconv_t)-1;
  3787. }
  3788. if (c->mref) {
  3789. scheme_remove_managed(c->mref, (Scheme_Object *)c);
  3790. c->mref = NULL;
  3791. }
  3792. }
  3793. }
  3794. Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
  3795. {
  3796. Scheme_Converter *c;
  3797. iconv_t cd;
  3798. int kind;
  3799. int permissive;
  3800. int need_regis = 1;
  3801. Scheme_Custodian_Reference *mref;
  3802. if (!iconv_ready) init_iconv();
  3803. if (!*to_e || !*from_e)
  3804. reset_locale();
  3805. if ((!strcmp(from_e, "UTF-8")
  3806. || !strcmp(from_e, "UTF-8-permissive")
  3807. || (!*from_e && mzLOCALE_IS_UTF_8(current_locale_name)))
  3808. && (!strcmp(to_e, "UTF-8")
  3809. || (!*to_e && mzLOCALE_IS_UTF_8(current_locale_name)))) {
  3810. /* Use the built-in UTF-8<->UTF-8 converter: */
  3811. kind = mzUTF8_KIND;
  3812. if (!strcmp(from_e, "UTF-8-permissive"))
  3813. permissive = 0xFFFD;
  3814. else
  3815. permissive = 0;
  3816. cd = (iconv_t)-1;
  3817. need_regis = (*to_e && *from_e);
  3818. } else if ((!strcmp(from_e, "platform-UTF-8")
  3819. || !strcmp(from_e, "platform-UTF-8-permissive"))
  3820. && !strcmp(to_e, "platform-UTF-16")) {
  3821. kind = mzUTF8_TO_UTF16_KIND;
  3822. if (!strcmp(from_e, "platform-UTF-8-permissive"))
  3823. permissive = 0xFFFD;
  3824. else
  3825. permissive = 0;
  3826. cd = (iconv_t)-1;
  3827. need_regis = 0;
  3828. } else if (!strcmp(from_e, "platform-UTF-16")
  3829. && !strcmp(to_e, "platform-UTF-8")) {
  3830. kind = mzUTF16_TO_UTF8_KIND;
  3831. permissive = 0;
  3832. cd = (iconv_t)-1;
  3833. need_regis = 0;
  3834. } else {
  3835. if (!iconv_ready) init_iconv();
  3836. if (!mzCHK_PROC(iconv_open))
  3837. return scheme_false;
  3838. if (!*from_e || !*to_e)
  3839. reset_locale();
  3840. if (!*from_e)
  3841. from_e = mz_iconv_nl_langinfo();
  3842. if (!*to_e)
  3843. to_e = mz_iconv_nl_langinfo();
  3844. cd = iconv_open(to_e, from_e);
  3845. if (cd == (iconv_t)-1)
  3846. return scheme_false;
  3847. kind = mzICONV_KIND;
  3848. permissive = 0;
  3849. }
  3850. c = MALLOC_ONE_TAGGED(Scheme_Converter);
  3851. c->so.type = scheme_string_converter_type;
  3852. c->closed = 0;
  3853. c->kind = kind;
  3854. c->permissive = permissive;
  3855. c->cd = cd;
  3856. if (!need_regis)
  3857. mref = NULL;
  3858. else
  3859. mref = scheme_add_managed(NULL,
  3860. (Scheme_Object *)c,
  3861. close_converter,
  3862. NULL, 1);
  3863. c->mref = mref;
  3864. return (Scheme_Object *)c;
  3865. }
  3866. static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object **argv)
  3867. {
  3868. Scheme_Object *s1, *s2;
  3869. char *from_e, *to_e;
  3870. if (!SCHEME_CHAR_STRINGP(argv[0]))
  3871. scheme_wrong_type("bytes-open-converter", "byte string", 0, argc, argv);
  3872. if (!SCHEME_CHAR_STRINGP(argv[1]))
  3873. scheme_wrong_type("bytes-open-converter", "byte string", 1, argc, argv);
  3874. scheme_custodian_check_available(NULL, "bytes-open-converter", "converter");
  3875. s1 = scheme_char_string_to_byte_string(argv[0]);
  3876. s2 = scheme_char_string_to_byte_string(argv[1]);
  3877. if (scheme_byte_string_has_null(s1))
  3878. return scheme_false;
  3879. if (scheme_byte_string_has_null(s2))
  3880. return scheme_false;
  3881. from_e = SCHEME_BYTE_STR_VAL(s1);
  3882. to_e = SCHEME_BYTE_STR_VAL(s2);
  3883. return scheme_open_converter(from_e, to_e);
  3884. }
  3885. static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Object *argv[])
  3886. {
  3887. char *r, *instr;
  3888. int status;
  3889. intptr_t amt_read, amt_wrote;
  3890. intptr_t istart, ifinish, ostart, ofinish;
  3891. Scheme_Object *a[3], *status_sym;
  3892. Scheme_Converter *c;
  3893. if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
  3894. scheme_wrong_type(who, "converter", 0, argc, argv);
  3895. if (opos > 1) {
  3896. if (!SCHEME_BYTE_STRINGP(argv[1]))
  3897. scheme_wrong_type(who, "bytes", 1, argc, argv);
  3898. scheme_get_substring_indices(who, argv[1], argc, argv, 2, 3, &istart, &ifinish);
  3899. } else {
  3900. istart = 0;
  3901. ifinish = 4; /* This is really a guess about how much space we need for a shift terminator */
  3902. }
  3903. if (argc > opos) {
  3904. if (SCHEME_TRUEP(argv[opos])) {
  3905. if (!SCHEME_MUTABLE_BYTE_STRINGP(argv[opos]))
  3906. scheme_wrong_type(who, "mutable byte string", opos, argc, argv);
  3907. r = SCHEME_BYTE_STR_VAL(argv[opos]);
  3908. scheme_get_substring_indices(who, argv[opos], argc, argv, opos + 1, opos + 2, &ostart, &ofinish);
  3909. } else {
  3910. int ip;
  3911. r = NULL;
  3912. for (ip = opos + 1; ip <= opos + 2; ip++) {
  3913. if (argc > ip) {
  3914. int ok = 0;
  3915. if (SCHEME_INTP(argv[ip]))
  3916. ok = SCHEME_INT_VAL(argv[ip]) >= 0;
  3917. else if (SCHEME_BIGNUMP(argv[ip]))
  3918. ok = SCHEME_BIGPOS(argv[ip]);
  3919. else if ((ip == opos + 2) && SCHEME_FALSEP(argv[ip]))
  3920. ok = 1;
  3921. if (!ok)
  3922. scheme_wrong_type(who,
  3923. ((ip == opos + 2)
  3924. ? "non-negative exact integer or #f"
  3925. : "non-negative exact integer"),
  3926. ip, argc, argv);
  3927. }
  3928. }
  3929. if ((argc > opos + 2) && SCHEME_TRUEP(argv[opos + 2])) {
  3930. Scheme_Object *delta;
  3931. if (scheme_bin_lt(argv[opos + 2], argv[opos + 1])) {
  3932. scheme_arg_mismatch(who,
  3933. "ending index is less than the starting index: ",
  3934. argv[opos + 2]);
  3935. }
  3936. delta = scheme_bin_minus(argv[opos + 2], argv[opos + 1]);
  3937. if (SCHEME_BIGNUMP(delta))
  3938. ofinish = -1;
  3939. else
  3940. ofinish = SCHEME_INT_VAL(delta);
  3941. ostart = 0;
  3942. } else {
  3943. ostart = 0;
  3944. ofinish = -1;
  3945. }
  3946. }
  3947. } else {
  3948. r = NULL;
  3949. ostart = 0;
  3950. ofinish = -1;
  3951. }
  3952. c = (Scheme_Converter *)argv[0];
  3953. if (c->closed)
  3954. scheme_arg_mismatch(who, "converter is closed: ", argv[0]);
  3955. instr = ((opos > 1) ? SCHEME_BYTE_STR_VAL(argv[1]) : NULL);
  3956. if (c->kind == mzUTF16_TO_UTF8_KIND) {
  3957. if (istart & 0x1) {
  3958. /* Copy to word-align */
  3959. char *c2;
  3960. c2 = (char *)scheme_malloc_atomic(ifinish - istart);
  3961. memcpy(c2, instr XFORM_OK_PLUS istart, ifinish - istart);
  3962. ifinish = ifinish - istart;
  3963. istart = 0;
  3964. instr = c2;
  3965. }
  3966. status = utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
  3967. (unsigned char *)r, ostart, ofinish,
  3968. &amt_read, &amt_wrote, 1);
  3969. amt_read -= (istart >> 1);
  3970. if (amt_read) {
  3971. if (!r) {
  3972. /* Need to allocate, then do it again: */
  3973. r = (char *)scheme_malloc_atomic(amt_wrote + 1);
  3974. utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
  3975. (unsigned char *)r, ostart, ofinish,
  3976. NULL, NULL, 1);
  3977. r[amt_wrote] = 0;
  3978. }
  3979. amt_read <<= 1;
  3980. }
  3981. /* We might get a -1 result because the input has an odd number of
  3982. bytes, and 2nd+next-to-last bytes form an unpaired
  3983. surrogate. In that case, the transformer normally needs one
  3984. more byte: Windows is little-endian, so we need the byte to
  3985. tell whether the surrogate is paired, and for all other
  3986. platforms (where we assume that surrogates are paired), we need
  3987. the byte to generate output. Technically, on a big-endian
  3988. non-Windows machine, we could generate the first byte of UTF-8
  3989. output and keep the byte as state, but we don't. */
  3990. if (status != -1) {
  3991. if (amt_read < ((ifinish - istart) & ~0x1)) {
  3992. /* Must have run out of output space */
  3993. status = 1;
  3994. } else {
  3995. /* Read all of input --- but it wasn't really all if there
  3996. was an odd number of bytes. */
  3997. if ((ifinish - istart) & 0x1)
  3998. status = -1;
  3999. else
  4000. status = 0;
  4001. }
  4002. }
  4003. } else if (c->kind != mzICONV_KIND) {
  4004. /* UTF-8 -> UTF-{8,16} "identity" converter, but maybe permissive */
  4005. if (instr) {
  4006. intptr_t _ostart, _ofinish;
  4007. int utf16;
  4008. if (c->kind == mzUTF8_TO_UTF16_KIND) {
  4009. _ostart = ostart;
  4010. _ofinish = ofinish;
  4011. if (_ostart & 0x1)
  4012. _ostart++;
  4013. _ostart >>= 1;
  4014. if (_ofinish > 0)
  4015. _ofinish >>= 1;
  4016. utf16 = 1;
  4017. } else {
  4018. _ostart = ostart;
  4019. _ofinish = ofinish;
  4020. utf16 = 0;
  4021. }
  4022. status = utf8_decode_x((unsigned char *)instr, istart, ifinish,
  4023. (unsigned int *)r, _ostart, _ofinish,
  4024. &amt_read, &amt_wrote,
  4025. 1, utf16, NULL, 1, c->permissive);
  4026. if (utf16) {
  4027. _ostart <<= 1;
  4028. amt_wrote <<= 1;
  4029. if ((ostart & 0x1) && (amt_wrote > _ostart)) {
  4030. /* Shift down one byte: */
  4031. memmove(r XFORM_OK_PLUS ostart, r XFORM_OK_PLUS _ostart, amt_wrote - _ostart);
  4032. }
  4033. }
  4034. amt_read -= istart;
  4035. amt_wrote -= _ostart;
  4036. if (status == -3) {
  4037. /* r is not NULL; ran out of room */
  4038. status = 1;
  4039. } else {
  4040. if (amt_wrote) {
  4041. if (!r) {
  4042. /* Need to allocate, then do it again: */
  4043. r = (char *)scheme_malloc_atomic(amt_wrote + 1);
  4044. utf8_decode_x((unsigned char *)instr, istart, ifinish,
  4045. (unsigned int *)r, ostart, _ofinish,
  4046. NULL, NULL,
  4047. 1, utf16, NULL, 1, c->permissive);
  4048. r[amt_wrote] = 0;
  4049. }
  4050. } else if (!r)
  4051. r = "";
  4052. if (status > 0)
  4053. status = 0;
  4054. }
  4055. } else {
  4056. r = "";
  4057. status = 0;
  4058. amt_read = 0;
  4059. amt_wrote = 0;
  4060. }
  4061. } else {
  4062. r = do_convert(c->cd, NULL, NULL, 0,
  4063. instr, istart, ifinish-istart,
  4064. r, ostart, ofinish-ostart,
  4065. !r, /* grow? */
  4066. 0,
  4067. (r ? 0 : 1), /* terminator */
  4068. &amt_read, &amt_wrote,
  4069. &status);
  4070. }
  4071. if (status == 0) {
  4072. /* Converted all input without error */
  4073. status_sym = complete_symbol;
  4074. } else if (status == 1) {
  4075. /* Filled output, more input ready */
  4076. status_sym = continues_symbol;
  4077. } else if (status == -1) {
  4078. /* Input ends in the middle of an encoding */
  4079. status_sym = aborts_symbol;
  4080. } else {
  4081. /* Assert: status == -2 */
  4082. /* Input has error (that won't be fixed by
  4083. adding more characters */
  4084. status_sym = error_symbol;
  4085. }
  4086. if (argc <= opos) {
  4087. a[0] = scheme_make_sized_byte_string(r, amt_wrote, 0);
  4088. } else {
  4089. a[0] = scheme_make_integer(amt_wrote);
  4090. }
  4091. if (opos > 1) {
  4092. a[1] = scheme_make_integer(amt_read);
  4093. a[2] = status_sym;
  4094. return scheme_values(3, a);
  4095. } else {
  4096. a[1] = status_sym;
  4097. return scheme_values(2, a);
  4098. }
  4099. }
  4100. static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[])
  4101. {
  4102. return convert_one("bytes-convert", 4, argc, argv);
  4103. }
  4104. static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[])
  4105. {
  4106. return convert_one("bytes-convert-end", 1, argc, argv);
  4107. }
  4108. void scheme_close_converter(Scheme_Object *conv)
  4109. {
  4110. close_converter(conv, NULL);
  4111. }
  4112. static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object **argv)
  4113. {
  4114. if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
  4115. scheme_wrong_type("bytes-close-converter", "converter", 0, argc, argv);
  4116. scheme_close_converter(argv[0]);
  4117. return scheme_void;
  4118. }
  4119. static Scheme_Object *
  4120. byte_converter_p(int argc, Scheme_Object *argv[])
  4121. {
  4122. return (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type)
  4123. ? scheme_true
  4124. : scheme_false);
  4125. }
  4126. /**********************************************************************/
  4127. /* utf8 converter */
  4128. /**********************************************************************/
  4129. static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end,
  4130. unsigned int *us, intptr_t dstart, intptr_t dend,
  4131. intptr_t *ipos, intptr_t *jpos,
  4132. char compact, char utf16, int *_state,
  4133. int might_continue, int permissive)
  4134. /* Results:
  4135. non-negative => translation complete, = number of produced chars
  4136. -1 => input ended in middle of encoding (only if might_continue)
  4137. -2 => encoding error (only if permissive is 0)
  4138. -3 => not enough output room
  4139. ipos & jpos are filled with ending positions (between [d]start
  4140. and [d]end) before return, unless they are NULL.
  4141. compact => UTF-8 to UTF-8 or UTF-16 --- the latter if utf16
  4142. for Windows for utf16, decode extended UTF-8 that allows surrogates
  4143. _state provides initial state and is filled with ending state;
  4144. when it's not NULL, the us must be NULL
  4145. might_continue => allows -1 result without consuming characters
  4146. permissive is non-zero => use permissive as value for bad byte
  4147. sequences. When generating UTF-8, this must be an ASCII character
  4148. or U+FFFD. */
  4149. {
  4150. intptr_t i, j, oki;
  4151. int failmode = -3, state;
  4152. int init_doki;
  4153. int nextbits, v;
  4154. unsigned int sc;
  4155. # ifdef WINDOWS_UNICODE_SUPPORT
  4156. int pending_surrogate = 0;
  4157. # endif
  4158. if (_state) {
  4159. state = (*_state) & 0x7;
  4160. init_doki = (((*_state) >> 3) & 0x7);
  4161. nextbits = ((((*_state) >> 6) & 0xF) << 2);
  4162. /* Need v to detect 0xD800 through 0xDFFF
  4163. Note that we have 22 bits to work with, which is
  4164. is enough to detect > 0x10FFFF */
  4165. v = ((*_state) >> 10);
  4166. } else {
  4167. state = 0;
  4168. init_doki = 0;
  4169. nextbits = 0;
  4170. v = 0;
  4171. }
  4172. /* In non-permissive mode, a negative result means ill-formed input.
  4173. Permissive mode accepts anything and tries to convert it. In
  4174. that case, the strategy for illegal sequences is to convert
  4175. anything bad to the given "permissive" value. */
  4176. if (end < 0)
  4177. end = strlen((char *)s);
  4178. if (dend < 0)
  4179. dend = 0x7FFFFFFF;
  4180. # define ENCFAIL i = oki; failmode = -2; break
  4181. oki = start;
  4182. j = dstart;
  4183. i = start;
  4184. if (j < dend) {
  4185. while (i < end) {
  4186. sc = s[i];
  4187. if (sc < 0x80) {
  4188. if (state) {
  4189. /* In a sequence, but didn't continue */
  4190. state = 0;
  4191. nextbits = 0;
  4192. if (permissive) {
  4193. v = permissive;
  4194. i = oki;
  4195. j += init_doki;
  4196. } else {
  4197. ENCFAIL;
  4198. }
  4199. } else {
  4200. v = sc;
  4201. }
  4202. } else if ((sc & 0xC0) == 0x80) {
  4203. /* Continues a sequence ... */
  4204. if (state) {
  4205. /* ... and we're in one ... */
  4206. if (!nextbits || (sc & nextbits)) {
  4207. /* and we have required bits. */
  4208. v = (v << 6) + (sc & 0x3F);
  4209. nextbits = 0;
  4210. --state;
  4211. if (state) {
  4212. i++;
  4213. continue;
  4214. }
  4215. /* We finished. One last check: */
  4216. if ((((v >= 0xD800) && (v <= 0xDFFF))
  4217. || (v > 0x10FFFF))
  4218. # ifdef WINDOWS_UNICODE_SUPPORT
  4219. && (!utf16
  4220. /* If UTF-16 for Windows, just apply upper-limit check */
  4221. || (v > 0x10FFFF))
  4222. # endif
  4223. ) {
  4224. /* UTF-16 surrogates or other illegal code units */
  4225. if (permissive) {
  4226. v = permissive;
  4227. j += init_doki;
  4228. i = oki;
  4229. } else {
  4230. ENCFAIL;
  4231. }
  4232. }
  4233. } else {
  4234. /* ... but we're missing required bits. */
  4235. state = 0;
  4236. nextbits = 0;
  4237. if (permissive) {
  4238. v = permissive;
  4239. j += init_doki;
  4240. i = oki;
  4241. } else {
  4242. ENCFAIL;
  4243. }
  4244. }
  4245. } else {
  4246. /* ... but we're not in one */
  4247. if (permissive) {
  4248. v = permissive;
  4249. } else {
  4250. ENCFAIL;
  4251. }
  4252. }
  4253. } else if (state) {
  4254. /* bad: already in a sequence */
  4255. state = 0;
  4256. if (permissive) {
  4257. v = permissive;
  4258. i = oki;
  4259. j += init_doki;
  4260. } else {
  4261. ENCFAIL;
  4262. }
  4263. } else {
  4264. if ((sc & 0xE0) == 0xC0) {
  4265. if (sc & 0x1E) {
  4266. state = 1;
  4267. v = (sc & 0x1F);
  4268. i++;
  4269. continue;
  4270. }
  4271. /* else too small */
  4272. } else if ((sc & 0xF0) == 0xE0) {
  4273. state = 2;
  4274. v = (sc & 0xF);
  4275. if (!v)
  4276. nextbits = 0x20;
  4277. i++;
  4278. continue;
  4279. } else if ((sc & 0xF8) == 0xF0) {
  4280. v = (sc & 0x7);
  4281. if (v <= 4) {
  4282. state = 3;
  4283. if (!v)
  4284. nextbits = 0x30;
  4285. i++;
  4286. continue;
  4287. }
  4288. /* Else will be larger than 0x10FFFF, so fail */
  4289. }
  4290. /* Too small, or 0xFF or 0xFe, or start of a 5- or 6-byte sequence */
  4291. if (permissive) {
  4292. v = permissive;
  4293. } else {
  4294. ENCFAIL;
  4295. }
  4296. }
  4297. /* If we get here, we're supposed to output v */
  4298. if (compact) {
  4299. if (utf16) {
  4300. if (v > 0xFFFF) {
  4301. # ifdef WINDOWS_UNICODE_SUPPORT
  4302. if (pending_surrogate) {
  4303. if (us)
  4304. ((unsigned short *)us)[j] = pending_surrogate;
  4305. j++; /* Accept previously written unpaired surrogate */
  4306. pending_surrogate = 0;
  4307. }
  4308. # endif
  4309. if (j + 1 >= dend)
  4310. break;
  4311. if (us) {
  4312. v -= 0x10000;
  4313. ((unsigned short *)us)[j] = 0xD800 | ((v >> 10) & 0x3FF);
  4314. ((unsigned short *)us)[j+1] = 0xDC00 | (v & 0x3FF);
  4315. }
  4316. j++;
  4317. } else {
  4318. # ifdef WINDOWS_UNICODE_SUPPORT
  4319. /* We allow a surrogate by itself, but don't allow
  4320. a 0xDC00 after a 0xD800, otherwise multiple encodings can
  4321. map to the same thing. */
  4322. if ((v >= 0xD800) && (v <= 0xDFFF)) {
  4323. if (pending_surrogate && ((v & 0xDC00) == 0xDC00)) {
  4324. /* This looks like a surrogate pair, so disallow it. */
  4325. if (permissive) {
  4326. /* We need to fill in 6 permissive substitutions,
  4327. one for each input byte. If we can't put all 6,
  4328. then don't use any input. */
  4329. if (j + 5 >= dend) {
  4330. break;
  4331. } else {
  4332. int p;
  4333. if (us) {
  4334. for (p = 0; p < 5; p++) {
  4335. if (j + p >= dend)
  4336. break;
  4337. ((unsigned short *)us)[j+p] = permissive;
  4338. }
  4339. }
  4340. j += 5;
  4341. v = permissive;
  4342. }
  4343. } else {
  4344. ENCFAIL;
  4345. }
  4346. pending_surrogate = 0;
  4347. } else {
  4348. if (pending_surrogate) {
  4349. if (us)
  4350. ((unsigned short *)us)[j] = pending_surrogate;
  4351. j++; /* Accept previousy written unpaired surrogate */
  4352. pending_surrogate = 0;
  4353. if (j >= dend)
  4354. break;
  4355. }
  4356. if ((v & 0xDC00) == 0xD800)
  4357. pending_surrogate = v;
  4358. else
  4359. pending_surrogate = 0;
  4360. }
  4361. } else {
  4362. if (pending_surrogate) {
  4363. if (us)
  4364. ((unsigned short *)us)[j] = pending_surrogate;
  4365. j++; /* Accept previousy written unpaired surrogate */
  4366. pending_surrogate = 0;
  4367. if (j >= dend)
  4368. break;
  4369. }
  4370. }
  4371. if (pending_surrogate)
  4372. --j; /* don't accept unpaired surrogate, yet */
  4373. else if (us)
  4374. ((unsigned short *)us)[j] = v;
  4375. # else
  4376. if (us)
  4377. ((unsigned short *)us)[j] = v;
  4378. # endif
  4379. }
  4380. } else {
  4381. intptr_t delta;
  4382. delta = (i - oki);
  4383. if (delta) {
  4384. if (j + delta + 1 < dend) {
  4385. if (us)
  4386. memcpy(((char *)us) + j, s + oki, delta + 1);
  4387. j += delta;
  4388. } else
  4389. break;
  4390. } else if (v == 0xFFFD) {
  4391. if (j + 3 < dend) {
  4392. if (us) {
  4393. ((unsigned char *)us)[j] = 0xEF;
  4394. ((unsigned char *)us)[j+1] = 0xBF;
  4395. ((unsigned char *)us)[j+2] = 0xBD;
  4396. }
  4397. j += 2;
  4398. } else
  4399. break;
  4400. } else if (us) {
  4401. ((unsigned char *)us)[j] = v;
  4402. }
  4403. }
  4404. } else if (us) {
  4405. us[j] = v;
  4406. }
  4407. j++;
  4408. i++;
  4409. oki = i;
  4410. init_doki = 0;
  4411. if (j >= dend)
  4412. break;
  4413. }
  4414. }
  4415. if (_state) {
  4416. if (!state)
  4417. *_state = 0;
  4418. else
  4419. *_state = (state
  4420. | (((end - oki) + init_doki) << 3)
  4421. | ((nextbits >> 2) << 6)
  4422. | (v << 10));
  4423. } else if (state) {
  4424. if (might_continue || !permissive) {
  4425. failmode = -1;
  4426. i = end - 1; /* to ensure that failmode is returned */
  4427. } else if (permissive) {
  4428. for (i = oki; i < end; i++) {
  4429. if (j < dend) {
  4430. if (us) {
  4431. if (compact) {
  4432. if (utf16)
  4433. ((unsigned short *)us)[j] = permissive;
  4434. else
  4435. ((unsigned char *)us)[j] = permissive;
  4436. } else
  4437. us[j] = permissive;
  4438. }
  4439. j++;
  4440. } else
  4441. break;
  4442. }
  4443. oki = i;
  4444. }
  4445. }
  4446. # ifdef WINDOWS_UNICODE_SUPPORT
  4447. if (pending_surrogate)
  4448. oki -= 3;
  4449. #endif
  4450. if (ipos)
  4451. *ipos = oki;
  4452. if (jpos)
  4453. *jpos = j;
  4454. if (i < end)
  4455. return failmode;
  4456. # ifdef WINDOWS_UNICODE_SUPPORT
  4457. if (pending_surrogate) {
  4458. /* input must have ended right after surrogate */
  4459. return -1;
  4460. }
  4461. #endif
  4462. return j - dstart;
  4463. }
  4464. intptr_t scheme_utf8_decode(const unsigned char *s, intptr_t start, intptr_t end,
  4465. unsigned int *us, intptr_t dstart, intptr_t dend,
  4466. intptr_t *ipos, char utf16, int permissive)
  4467. {
  4468. return utf8_decode_x(s, start, end, us, dstart, dend,
  4469. ipos, NULL, utf16, utf16, NULL, 0, permissive);
  4470. }
  4471. intptr_t scheme_utf8_decode_as_prefix(const unsigned char *s, intptr_t start, intptr_t end,
  4472. unsigned int *us, intptr_t dstart, intptr_t dend,
  4473. intptr_t *ipos, char utf16, int permissive)
  4474. /* Always returns number of read characters, not error codes. */
  4475. {
  4476. intptr_t opos;
  4477. utf8_decode_x(s, start, end, us, dstart, dend,
  4478. ipos, &opos, utf16, utf16, NULL, 1, permissive);
  4479. return opos - dstart;
  4480. }
  4481. intptr_t scheme_utf8_decode_all(const unsigned char *s, intptr_t len, unsigned int *us, int permissive)
  4482. {
  4483. return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 0, permissive);
  4484. }
  4485. intptr_t scheme_utf8_decode_prefix(const unsigned char *s, intptr_t len, unsigned int *us, int permissive)
  4486. /* us != NULL */
  4487. {
  4488. {
  4489. /* Try fast path (all ASCII) */
  4490. intptr_t i;
  4491. for (i = 0; i < len; i++) {
  4492. if (s[i] < 128)
  4493. us[i] = s[i];
  4494. else
  4495. break;
  4496. }
  4497. if (i == len)
  4498. return len;
  4499. }
  4500. return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 1, permissive);
  4501. }
  4502. mzchar *scheme_utf8_decode_to_buffer_len(const unsigned char *s, intptr_t len,
  4503. mzchar *buf, intptr_t blen, intptr_t *_ulen)
  4504. {
  4505. intptr_t ulen;
  4506. ulen = utf8_decode_x(s, 0, len, NULL, 0, -1,
  4507. NULL, NULL, 0, 0,
  4508. NULL, 0, 0);
  4509. if (ulen < 0)
  4510. return NULL;
  4511. if (ulen + 1 > blen) {
  4512. buf = (mzchar *)scheme_malloc_atomic((ulen + 1) * sizeof(mzchar));
  4513. }
  4514. utf8_decode_x(s, 0, len, buf, 0, -1,
  4515. NULL, NULL, 0, 0,
  4516. NULL, 0, 0);
  4517. buf[ulen] = 0;
  4518. *_ulen = ulen;
  4519. return buf;
  4520. }
  4521. mzchar *scheme_utf8_decode_to_buffer(const unsigned char *s, intptr_t len,
  4522. mzchar *buf, intptr_t blen)
  4523. {
  4524. intptr_t ulen;
  4525. return scheme_utf8_decode_to_buffer_len(s, len, buf, blen, &ulen);
  4526. }
  4527. intptr_t scheme_utf8_decode_count(const unsigned char *s, intptr_t start, intptr_t end,
  4528. int *_state, int might_continue, int permissive)
  4529. {
  4530. intptr_t pos = 0;
  4531. if (!_state || !*_state) {
  4532. /* Try fast path (all ASCII): */
  4533. intptr_t i;
  4534. for (i = start; i < end; i++) {
  4535. if (s[i] > 127)
  4536. break;
  4537. }
  4538. if (i == end)
  4539. return end - start;
  4540. }
  4541. utf8_decode_x(s, start, end,
  4542. NULL, 0, -1,
  4543. NULL, &pos,
  4544. 0, 0, _state,
  4545. might_continue, permissive);
  4546. return pos;
  4547. }
  4548. static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end,
  4549. unsigned char *s, intptr_t dstart, intptr_t dend,
  4550. intptr_t *_ipos, intptr_t *_opos, char utf16)
  4551. /* Results:
  4552. -1 => input ended in the middle of an encoding - only when utf16 and _opos
  4553. non-negative => reports number of bytes/code-units produced */
  4554. {
  4555. intptr_t i, j, done = start;
  4556. if (dend < 0)
  4557. dend = 0x7FFFFFFF;
  4558. if (!s) {
  4559. unsigned int wc;
  4560. j = 0;
  4561. for (i = start; i < end; i++) {
  4562. if (utf16) {
  4563. wc = ((unsigned short *)us)[i];
  4564. if ((wc & 0xF800) == 0xD800) {
  4565. /* Unparse surrogates. We assume that the surrogates are
  4566. well formed, unless this is Windows or if we're at the
  4567. end and _opos is 0. */
  4568. # ifdef WINDOWS_UNICODE_SUPPORT
  4569. # define UNPAIRED_MASK 0xFC00
  4570. # else
  4571. # define UNPAIRED_MASK 0xF800
  4572. # endif
  4573. if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) {
  4574. /* Ended in the middle of a surrogate pair */
  4575. *_opos = j;
  4576. if (_ipos)
  4577. *_ipos = i;
  4578. return -1;
  4579. }
  4580. # ifdef WINDOWS_UNICODE_SUPPORT
  4581. if ((wc & 0xFC00) != 0xD800) {
  4582. /* Count as one */
  4583. } else if ((i + 1 >= end)
  4584. || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00)) {
  4585. } else
  4586. # endif
  4587. {
  4588. i++;
  4589. wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
  4590. wc += 0x10000;
  4591. }
  4592. }
  4593. } else {
  4594. wc = us[i];
  4595. }
  4596. if (wc < 0x80) {
  4597. j += 1;
  4598. } else if (wc < 0x800) {
  4599. j += 2;
  4600. } else if (wc < 0x10000) {
  4601. j += 3;
  4602. } else if (wc < 0x200000) {
  4603. j += 4;
  4604. } else if (wc < 0x4000000) {
  4605. j += 5;
  4606. } else {
  4607. j += 6;
  4608. }
  4609. }
  4610. if (_ipos)
  4611. *_ipos = i;
  4612. if (_opos)
  4613. *_opos = j + dstart;
  4614. return j;
  4615. } else {
  4616. unsigned int wc;
  4617. j = dstart;
  4618. for (i = start; i < end; i++) {
  4619. if (utf16) {
  4620. wc = ((unsigned short *)us)[i];
  4621. if ((wc & 0xF800) == 0xD800) {
  4622. /* Unparse surrogates. We assume that the surrogates are
  4623. well formed on non-Windows platforms, but when _opos,
  4624. we detect ending in the middle of an surrogate pair. */
  4625. if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) {
  4626. /* Ended in the middle of a surrogate pair */
  4627. *_opos = j;
  4628. if (_ipos)
  4629. *_ipos = i;
  4630. return -1;
  4631. }
  4632. # ifdef WINDOWS_UNICODE_SUPPORT
  4633. if ((wc & 0xFC00) != 0xD800) {
  4634. /* Let the misplaced surrogate through */
  4635. } else if ((i + 1 >= end)
  4636. || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00)) {
  4637. /* Let the misplaced surrogate through */
  4638. } else
  4639. # endif
  4640. {
  4641. i++;
  4642. wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
  4643. wc += 0x10000;
  4644. }
  4645. }
  4646. } else {
  4647. wc = us[i];
  4648. }
  4649. if (wc < 0x80) {
  4650. if (j + 1 > dend)
  4651. break;
  4652. s[j++] = wc;
  4653. } else if (wc < 0x800) {
  4654. if (j + 2 > dend)
  4655. break;
  4656. s[j++] = 0xC0 | ((wc & 0x7C0) >> 6);
  4657. s[j++] = 0x80 | (wc & 0x3F);
  4658. } else if (wc < 0x10000) {
  4659. if (j + 3 > dend)
  4660. break;
  4661. s[j++] = 0xE0 | ((wc & 0xF000) >> 12);
  4662. s[j++] = 0x80 | ((wc & 0x0FC0) >> 6);
  4663. s[j++] = 0x80 | (wc & 0x3F);
  4664. } else if (wc < 0x200000) {
  4665. if (j + 4 > dend)
  4666. break;
  4667. s[j++] = 0xF0 | ((wc & 0x1C0000) >> 18);
  4668. s[j++] = 0x80 | ((wc & 0x03F000) >> 12);
  4669. s[j++] = 0x80 | ((wc & 0x000FC0) >> 6);
  4670. s[j++] = 0x80 | (wc & 0x3F);
  4671. } else if (wc < 0x4000000) {
  4672. if (j + 5 > dend)
  4673. break;
  4674. s[j++] = 0xF8 | ((wc & 0x3000000) >> 24);
  4675. s[j++] = 0x80 | ((wc & 0x0FC0000) >> 18);
  4676. s[j++] = 0x80 | ((wc & 0x003F000) >> 12);
  4677. s[j++] = 0x80 | ((wc & 0x0000FC0) >> 6);
  4678. s[j++] = 0x80 | (wc & 0x3F);
  4679. } else {
  4680. if (j + 6 > dend)
  4681. break;
  4682. s[j++] = 0xFC | ((wc & 0x40000000) >> 30);
  4683. s[j++] = 0x80 | ((wc & 0x3F000000) >> 24);
  4684. s[j++] = 0x80 | ((wc & 0x00FC0000) >> 18);
  4685. s[j++] = 0x80 | ((wc & 0x0003F000) >> 12);
  4686. s[j++] = 0x80 | ((wc & 0x00000FC0) >> 6);
  4687. s[j++] = 0x80 | (wc & 0x3F);
  4688. }
  4689. done = i;
  4690. }
  4691. if (_ipos)
  4692. *_ipos = done;
  4693. if (_opos)
  4694. *_opos = j;
  4695. return j - dstart;
  4696. }
  4697. }
  4698. intptr_t scheme_utf8_encode(const unsigned int *us, intptr_t start, intptr_t end,
  4699. unsigned char *s, intptr_t dstart,
  4700. char utf16)
  4701. {
  4702. return utf8_encode_x(us, start, end,
  4703. s, dstart, -1,
  4704. NULL, NULL, utf16);
  4705. }
  4706. intptr_t scheme_utf8_encode_all(const unsigned int *us, intptr_t len, unsigned char *s)
  4707. {
  4708. return utf8_encode_x(us, 0, len, s, 0, -1, NULL, NULL, 0 /* utf16 */);
  4709. }
  4710. char *scheme_utf8_encode_to_buffer_len(const mzchar *s, intptr_t len,
  4711. char *buf, intptr_t blen,
  4712. intptr_t *_slen)
  4713. {
  4714. intptr_t slen;
  4715. /* ASCII with len < blen is a common case: */
  4716. if (len < blen) {
  4717. for (slen = 0; slen < len; slen++) {
  4718. if (s[slen] > 127)
  4719. break;
  4720. else
  4721. buf[slen] = s[slen];
  4722. }
  4723. if (slen == len) {
  4724. buf[slen] = 0;
  4725. *_slen = slen;
  4726. return buf;
  4727. }
  4728. }
  4729. slen = utf8_encode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0);
  4730. if (slen + 1 > blen) {
  4731. buf = (char *)scheme_malloc_atomic(slen + 1);
  4732. }
  4733. utf8_encode_x(s, 0, len, (unsigned char *)buf, 0, -1, NULL, NULL, 0);
  4734. buf[slen] = 0;
  4735. *_slen = slen;
  4736. return buf;
  4737. }
  4738. char *scheme_utf8_encode_to_buffer(const mzchar *s, intptr_t len,
  4739. char *buf, intptr_t blen)
  4740. {
  4741. intptr_t slen;
  4742. return scheme_utf8_encode_to_buffer_len(s, len, buf, blen, &slen);
  4743. }
  4744. unsigned short *scheme_ucs4_to_utf16(const mzchar *text, intptr_t start, intptr_t end,
  4745. unsigned short *buf, intptr_t bufsize,
  4746. intptr_t *ulen, intptr_t term_size)
  4747. {
  4748. mzchar v;
  4749. intptr_t extra, i, j;
  4750. unsigned short *utf16;
  4751. /* Count characters that fall outside UCS-2: */
  4752. for (i = start, extra = 0; i < end; i++) {
  4753. if (text[i] > 0xFFFF)
  4754. extra++;
  4755. }
  4756. if ((end - start) + extra + term_size < bufsize)
  4757. utf16 = buf;
  4758. else
  4759. utf16 = (unsigned short *)scheme_malloc_atomic(sizeof(unsigned short) * ((end - start) + extra + term_size));
  4760. for (i = start, j = 0; i < end; i++) {
  4761. v = text[i];
  4762. if (v > 0xFFFF) {
  4763. utf16[j++] = 0xD800 | ((v >> 10) & 0x3FF);
  4764. utf16[j++] = 0xDC00 | (v & 0x3FF);
  4765. } else
  4766. utf16[j++] = v;
  4767. }
  4768. *ulen = j;
  4769. return utf16;
  4770. }
  4771. mzchar *scheme_utf16_to_ucs4(const unsigned short *text, intptr_t start, intptr_t end,
  4772. mzchar *buf, intptr_t bufsize,
  4773. intptr_t *ulen, intptr_t term_size)
  4774. {
  4775. int wc;
  4776. intptr_t i, j;
  4777. for (i = start, j = 0; i < end; i++) {
  4778. wc = text[i];
  4779. if ((wc & 0xF800) == 0xD800) {
  4780. i++;
  4781. }
  4782. j++;
  4783. }
  4784. if (j + term_size >= bufsize)
  4785. buf = (mzchar *)scheme_malloc_atomic((j + term_size) * sizeof(mzchar));
  4786. for (i = start, j = 0; i < end; i++) {
  4787. wc = text[i];
  4788. if ((wc & 0xF800) == 0xD800) {
  4789. i++;
  4790. wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)text)[i]) & 0x3FF);
  4791. wc += 0x10000;
  4792. }
  4793. buf[j++] = wc;
  4794. }
  4795. *ulen = j;
  4796. return buf;
  4797. }
  4798. /**********************************************************************/
  4799. /* machine type details */
  4800. /**********************************************************************/
  4801. /**************************** MacOS ***********************************/
  4802. #if defined(MACINTOSH_EVENTS) && !defined(OS_X)
  4803. # include <Gestalt.h>
  4804. extern intptr_t scheme_this_ip(void);
  4805. static void machine_details(char *s)
  4806. {
  4807. OSErr err;
  4808. intptr_t lng;
  4809. char sysvers[30];
  4810. char machine_name[256];
  4811. err = Gestalt(gestaltSystemVersion, &lng);
  4812. if (err != noErr) {
  4813. strcpy(sysvers, "<unknown system>");
  4814. } else {
  4815. int i;
  4816. sprintf(sysvers, "%X.%X",
  4817. (lng >> 8) & 0xff,
  4818. lng & 0xff);
  4819. /* remove trailing zeros, put dot before something else */
  4820. i = strlen(sysvers);
  4821. if (i > 1) {
  4822. if (sysvers[i-1] != '.') {
  4823. if (sysvers[i-1] == '0') {
  4824. sysvers[i-1] = 0;
  4825. i--;
  4826. } else {
  4827. sysvers[i] = sysvers[i-1];
  4828. sysvers[i-1] = '.';
  4829. i++;
  4830. sysvers[i] = 0;
  4831. }
  4832. }
  4833. }
  4834. }
  4835. err = Gestalt(gestaltMachineType, &lng);
  4836. if (err != noErr) {
  4837. strcpy(machine_name, "<unknown machine>");
  4838. } else {
  4839. Str255 machine_name_pascal;
  4840. GetIndString(machine_name_pascal, kMachineNameStrID, lng);
  4841. CopyPascalStringToC(machine_name_pascal, machine_name);
  4842. }
  4843. lng = scheme_this_ip();
  4844. sprintf(s, "%s %s %d.%d.%d.%d", sysvers, machine_name,
  4845. ((unsigned char *)&lng)[0],
  4846. ((unsigned char *)&lng)[1],
  4847. ((unsigned char *)&lng)[2],
  4848. ((unsigned char *)&lng)[3]);
  4849. }
  4850. #endif
  4851. /*************************** Windows **********************************/
  4852. #ifdef DOS_FILE_SYSTEM
  4853. # include <windows.h>
  4854. void machine_details(char *buff)
  4855. {
  4856. OSVERSIONINFO info;
  4857. BOOL hasInfo;
  4858. char *p;
  4859. info.dwOSVersionInfoSize = sizeof(info);
  4860. GetVersionEx(&info);
  4861. hasInfo = FALSE;
  4862. p = info.szCSDVersion;
  4863. while (p < info.szCSDVersion + sizeof(info.szCSDVersion) &&
  4864. *p) {
  4865. if (*p != ' ') {
  4866. hasInfo = TRUE;
  4867. break;
  4868. }
  4869. p = p XFORM_OK_PLUS 1;
  4870. }
  4871. sprintf(buff,"Windows %s %ld.%ld (Build %ld)%s%s",
  4872. (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) ?
  4873. "9x" :
  4874. (info.dwPlatformId == VER_PLATFORM_WIN32_NT) ?
  4875. "NT" : "Unknown platform",
  4876. info.dwMajorVersion,info.dwMinorVersion,
  4877. (info.dwPlatformId == VER_PLATFORM_WIN32_NT) ?
  4878. info.dwBuildNumber :
  4879. info.dwBuildNumber & 0xFFFF,
  4880. hasInfo ? " " : "",hasInfo ? info.szCSDVersion : "");
  4881. }
  4882. #endif
  4883. /***************************** OSKit **********************************/
  4884. #ifdef USE_OSKIT_CONSOLE
  4885. void machine_details(char *buff)
  4886. {
  4887. strcpy(buff, "OSKit");
  4888. }
  4889. #endif
  4890. /***************************** Unix ***********************************/
  4891. #if (!defined(MACINTOSH_EVENTS) || defined(OS_X)) && !defined(DOS_FILE_SYSTEM) && !defined(USE_OSKIT_CONSOLE)
  4892. READ_ONLY static char *uname_locations[] = { "/bin/uname",
  4893. "/usr/bin/uname",
  4894. /* The above should cover everything, but
  4895. just in case... */
  4896. "/sbin/uname",
  4897. "/usr/sbin/uname",
  4898. "/usr/local/bin/uname",
  4899. "/usr/local/uname",
  4900. NULL };
  4901. static int try_subproc(Scheme_Object *subprocess_proc, char *prog)
  4902. {
  4903. Scheme_Object *a[5];
  4904. mz_jmp_buf * volatile savebuf, newbuf;
  4905. savebuf = scheme_current_thread->error_buf;
  4906. scheme_current_thread->error_buf = &newbuf;
  4907. if (!scheme_setjmp(newbuf)) {
  4908. a[0] = scheme_false;
  4909. a[1] = scheme_false;
  4910. a[2] = scheme_false;
  4911. a[3] = scheme_make_locale_string(prog);
  4912. a[4] = scheme_make_locale_string("-a");
  4913. _scheme_apply_multi(subprocess_proc, 5, a);
  4914. scheme_current_thread->error_buf = savebuf;
  4915. return 1;
  4916. } else {
  4917. scheme_clear_escape();
  4918. scheme_current_thread->error_buf = savebuf;
  4919. return 0;
  4920. }
  4921. }
  4922. void machine_details(char *buff)
  4923. {
  4924. Scheme_Object *subprocess_proc;
  4925. int i;
  4926. subprocess_proc = scheme_builtin_value("subprocess");
  4927. for (i = 0; uname_locations[i]; i++) {
  4928. if (scheme_file_exists(uname_locations[i])) {
  4929. /* Try running it. */
  4930. if (try_subproc(subprocess_proc, uname_locations[i])) {
  4931. Scheme_Object *sout, *sin, *serr;
  4932. intptr_t c;
  4933. sout = scheme_current_thread->ku.multiple.array[1];
  4934. sin = scheme_current_thread->ku.multiple.array[2];
  4935. serr = scheme_current_thread->ku.multiple.array[3];
  4936. scheme_close_output_port(sin);
  4937. scheme_close_input_port(serr);
  4938. /* Read result: */
  4939. strcpy(buff, "<unknown machine>");
  4940. c = scheme_get_bytes(sout, 1023, buff, 0);
  4941. buff[c] = 0;
  4942. scheme_close_input_port(sout);
  4943. /* Remove trailing whitespace (especially newlines) */
  4944. while (c && portable_isspace(((unsigned char *)buff)[c - 1])) {
  4945. buff[--c] = 0;
  4946. }
  4947. return;
  4948. }
  4949. }
  4950. }
  4951. strcpy(buff, "<unknown machine>");
  4952. }
  4953. #endif
  4954. /**********************************************************************/
  4955. /* Precise GC */
  4956. /**********************************************************************/
  4957. #ifdef MZ_PRECISE_GC
  4958. START_XFORM_SKIP;
  4959. #include "mzmark_string.inc"
  4960. static void register_traversers(void)
  4961. {
  4962. GC_REG_TRAV(scheme_string_converter_type, mark_string_convert);
  4963. }
  4964. END_XFORM_SKIP;
  4965. #endif