/src/racket/src/string.c

http://github.com/gmarceau/PLT · C · 5657 lines · 4630 code · 683 blank · 344 comment · 1049 complexity · 0664fdaee9cf399f842278273762caf3 MD5 · raw file

Large files are truncated click here to view the full file

  1. /*
  2. Racket
  3. Copyright (c) 2004-2011 PLT Scheme Inc.
  4. Copyright (c) 1995-2001 Matthew Flatt
  5. This library is free software; you can redistribute it and/or
  6. modify it under the terms of the GNU Library General Public
  7. License as published by the Free Software Foundation; either
  8. version 2 of the License, or (at your option) any later version.
  9. This library is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. Library General Public License for more details.
  13. You should have received a copy of the GNU Library General Public
  14. License along with this library; if not, write to the Free
  15. Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  16. Boston, MA 02110-1301 USA.
  17. libscheme
  18. Copyright (c) 1994 Brent Benson
  19. All rights reserved.
  20. */
  21. #include "schpriv.h"
  22. #include "schvers.h"
  23. #include <string.h>
  24. #include <ctype.h>
  25. #ifndef DONT_USE_LOCALE
  26. # include <locale.h>
  27. # ifdef MZ_NO_ICONV
  28. # define USE_ICONV_DLL
  29. # endif
  30. # ifndef USE_ICONV_DLL
  31. # include <iconv.h>
  32. # include <langinfo.h>
  33. # endif
  34. # include <wchar.h>
  35. # include <wctype.h>
  36. # include <errno.h>
  37. # ifdef MACOS_UNICODE_SUPPORT
  38. # include <CoreFoundation/CFString.h>
  39. # include <CoreFoundation/CFLocale.h>
  40. # endif
  41. # ifdef WINDOWS_UNICODE_SUPPORT
  42. # include <windows.h>
  43. # endif
  44. #endif
  45. #ifndef SCHEME_PLATFORM_LIBRARY_SUBPATH
  46. # include "schsys.h"
  47. #endif
  48. #include "schustr.inc"
  49. #ifdef USE_ICONV_DLL
  50. typedef intptr_t iconv_t;
  51. typedef int *(*errno_proc_t)();
  52. typedef size_t (*iconv_proc_t)(iconv_t cd,
  53. char **inbuf, size_t *inbytesleft,
  54. char **outbuf, size_t *outbytesleft);
  55. typedef iconv_t (*iconv_open_proc_t)(const char *tocode, const char *fromcode);
  56. typedef void (*iconv_close_proc_t)(iconv_t cd);
  57. typedef char *(*locale_charset_proc_t)();
  58. static errno_proc_t iconv_errno;
  59. static iconv_proc_t iconv;
  60. static iconv_open_proc_t iconv_open;
  61. static iconv_close_proc_t iconv_close;
  62. static locale_charset_proc_t locale_charset; /* Not used, currently */
  63. #define mzCHK_PROC(x) x
  64. static int get_iconv_errno(void)
  65. {
  66. int *a;
  67. a = iconv_errno();
  68. return *a;
  69. }
  70. # undef HAVE_CODESET
  71. # define HAVE_CODESET 1
  72. # define CODESET 0
  73. # define ICONV_errno get_iconv_errno()
  74. extern wchar_t *scheme_get_dll_path(wchar_t *s);
  75. static int iconv_ready = 0;
  76. static void init_iconv()
  77. {
  78. # ifdef MZ_NO_ICONV
  79. # else
  80. HMODULE m;
  81. m = LoadLibraryW(scheme_get_dll_path(L"iconv.dll"));
  82. if (!m)
  83. m = LoadLibraryW(scheme_get_dll_path(L"libiconv.dll"));
  84. if (!m)
  85. m = LoadLibraryW(scheme_get_dll_path(L"libiconv-2.dll"));
  86. if (!m)
  87. m = LoadLibrary("iconv.dll");
  88. if (!m)
  89. m = LoadLibrary("libiconv.dll");
  90. if (!m)
  91. m = LoadLibrary("libiconv-2.dll");
  92. if (m) {
  93. iconv = (iconv_proc_t)GetProcAddress(m, "libiconv");
  94. iconv_open = (iconv_open_proc_t)GetProcAddress(m, "libiconv_open");
  95. iconv_close = (iconv_close_proc_t)GetProcAddress(m, "libiconv_close");
  96. locale_charset = (locale_charset_proc_t)GetProcAddress(m, "locale_charset");
  97. /* Make sure we have all of them or none: */
  98. if (!iconv || !iconv_open || !iconv_close) {
  99. iconv = NULL;
  100. iconv_open = NULL;
  101. iconv_close = NULL;
  102. }
  103. }
  104. if (iconv) {
  105. iconv_errno = (errno_proc_t)GetProcAddress(m, "_errno");
  106. if (!iconv_errno) {
  107. /* The iconv.dll distributed with PLT Scheme links to msvcrt.dll.
  108. It's a slighly dangerous assumption that whatever iconv we
  109. found also uses msvcrt.dll. */
  110. m = LoadLibrary("msvcrt.dll");
  111. if (m) {
  112. iconv_errno = (errno_proc_t)GetProcAddress(m, "_errno");
  113. if (!iconv_errno) {
  114. iconv = NULL;
  115. iconv_open = NULL;
  116. iconv_close = NULL;
  117. }
  118. }
  119. }
  120. }
  121. # endif
  122. iconv_ready = 1;
  123. }
  124. #else
  125. # define ICONV_errno errno
  126. # define iconv_ready 1
  127. # define mzCHK_PROC(x) 1
  128. static void init_iconv() { }
  129. #endif
  130. #ifdef MACOS_UNICODE_SUPPORT
  131. # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
  132. #endif
  133. #ifdef WINDOWS_UNICODE_SUPPORT
  134. # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
  135. #endif
  136. #ifndef mzLOCALE_IS_UTF_8
  137. # define mzLOCALE_IS_UTF_8(s) !mzCHK_PROC(iconv_open)
  138. #endif
  139. #define mzICONV_KIND 0
  140. #define mzUTF8_KIND 1
  141. #define mzUTF8_TO_UTF16_KIND 2
  142. #define mzUTF16_TO_UTF8_KIND 3
  143. typedef struct Scheme_Converter {
  144. Scheme_Object so;
  145. short closed;
  146. short kind;
  147. iconv_t cd;
  148. int permissive;
  149. Scheme_Custodian_Reference *mref;
  150. } Scheme_Converter;
  151. /* locals */
  152. /* These two locale variables are only valid when reset_locale()
  153. is called after continuation marks (and hence parameterization)
  154. may have changed. Similarly, setlocale() is only up-to-date
  155. when reset_locale() has been called. */
  156. THREAD_LOCAL_DECL(static int locale_on);
  157. THREAD_LOCAL_DECL(static void *current_locale_name_ptr);
  158. static void reset_locale(void);
  159. #define current_locale_name ((const mzchar *)current_locale_name_ptr)
  160. #ifdef USE_ICONV_DLL
  161. static char *nl_langinfo(int which)
  162. {
  163. int i;
  164. reset_locale();
  165. if (!current_locale_name)
  166. current_locale_name_ptr = "\0\0\0\0";
  167. if ((current_locale_name[0] == 'C')
  168. && !current_locale_name[1])
  169. return "US-ASCII";
  170. for (i = 0; current_locale_name[i]; i++) {
  171. if (current_locale_name[i] == '.') {
  172. if (current_locale_name[i + 1]) {
  173. int len, j = 0;
  174. char *enc;
  175. i++;
  176. len = scheme_char_strlen(current_locale_name) - i;
  177. enc = (char *)scheme_malloc_atomic(len + 1);
  178. while (current_locale_name[i]) {
  179. if (current_locale_name[i] > 127)
  180. return "UTF-8";
  181. enc[j++] = current_locale_name[i++];
  182. }
  183. enc[j] = 0;
  184. return enc;
  185. }
  186. }
  187. }
  188. return "UTF-8";
  189. }
  190. #endif
  191. #ifdef DONT_USE_LOCALE
  192. # define mz_iconv_nl_langinfo() ""
  193. #else
  194. static char *mz_iconv_nl_langinfo(){
  195. char *s;
  196. # if HAVE_CODESET
  197. s = nl_langinfo(CODESET);
  198. # else
  199. s = NULL;
  200. # endif
  201. if (!s)
  202. return "";
  203. else
  204. return s;
  205. }
  206. #endif
  207. READ_ONLY static const char * const STRING_IS_NOT_UTF_8 = "string is not a well-formed UTF-8 encoding: ";
  208. static Scheme_Object *make_string (int argc, Scheme_Object *argv[]);
  209. static Scheme_Object *string (int argc, Scheme_Object *argv[]);
  210. static Scheme_Object *string_p (int argc, Scheme_Object *argv[]);
  211. static Scheme_Object *string_length (int argc, Scheme_Object *argv[]);
  212. static Scheme_Object *string_eq (int argc, Scheme_Object *argv[]);
  213. static Scheme_Object *string_locale_eq (int argc, Scheme_Object *argv[]);
  214. static Scheme_Object *string_ci_eq (int argc, Scheme_Object *argv[]);
  215. static Scheme_Object *string_locale_ci_eq (int argc, Scheme_Object *argv[]);
  216. static Scheme_Object *string_lt (int argc, Scheme_Object *argv[]);
  217. static Scheme_Object *string_locale_lt (int argc, Scheme_Object *argv[]);
  218. static Scheme_Object *string_gt (int argc, Scheme_Object *argv[]);
  219. static Scheme_Object *string_locale_gt (int argc, Scheme_Object *argv[]);
  220. static Scheme_Object *string_lt_eq (int argc, Scheme_Object *argv[]);
  221. static Scheme_Object *string_gt_eq (int argc, Scheme_Object *argv[]);
  222. static Scheme_Object *string_ci_lt (int argc, Scheme_Object *argv[]);
  223. static Scheme_Object *string_locale_ci_lt (int argc, Scheme_Object *argv[]);
  224. static Scheme_Object *string_ci_gt (int argc, Scheme_Object *argv[]);
  225. static Scheme_Object *string_locale_ci_gt (int argc, Scheme_Object *argv[]);
  226. static Scheme_Object *string_ci_lt_eq (int argc, Scheme_Object *argv[]);
  227. static Scheme_Object *string_ci_gt_eq (int argc, Scheme_Object *argv[]);
  228. static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[]);
  229. static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[]);
  230. static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[]);
  231. static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[]);
  232. static Scheme_Object *string_locale_upcase (int argc, Scheme_Object *argv[]);
  233. static Scheme_Object *string_locale_downcase (int argc, Scheme_Object *argv[]);
  234. static Scheme_Object *substring (int argc, Scheme_Object *argv[]);
  235. static Scheme_Object *string_append (int argc, Scheme_Object *argv[]);
  236. static Scheme_Object *string_to_list (int argc, Scheme_Object *argv[]);
  237. static Scheme_Object *list_to_string (int argc, Scheme_Object *argv[]);
  238. static Scheme_Object *string_copy (int argc, Scheme_Object *argv[]);
  239. static Scheme_Object *string_copy_bang (int argc, Scheme_Object *argv[]);
  240. static Scheme_Object *string_fill (int argc, Scheme_Object *argv[]);
  241. static Scheme_Object *string_to_immutable (int argc, Scheme_Object *argv[]);
  242. static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[]);
  243. static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[]);
  244. static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[]);
  245. static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[]);
  246. static Scheme_Object *make_shared_byte_string (int argc, Scheme_Object *argv[]);
  247. static Scheme_Object *shared_byte_string (int argc, Scheme_Object *argv[]);
  248. static Scheme_Object *make_byte_string (int argc, Scheme_Object *argv[]);
  249. static Scheme_Object *byte_string (int argc, Scheme_Object *argv[]);
  250. static Scheme_Object *byte_p (int argc, Scheme_Object *argv[]);
  251. static Scheme_Object *byte_string_p (int argc, Scheme_Object *argv[]);
  252. static Scheme_Object *byte_string_length (int argc, Scheme_Object *argv[]);
  253. static Scheme_Object *byte_string_eq (int argc, Scheme_Object *argv[]);
  254. static Scheme_Object *byte_string_lt (int argc, Scheme_Object *argv[]);
  255. static Scheme_Object *byte_string_gt (int argc, Scheme_Object *argv[]);
  256. static Scheme_Object *byte_substring (int argc, Scheme_Object *argv[]);
  257. static Scheme_Object *byte_string_append (int argc, Scheme_Object *argv[]);
  258. static Scheme_Object *byte_string_to_list (int argc, Scheme_Object *argv[]);
  259. static Scheme_Object *list_to_byte_string (int argc, Scheme_Object *argv[]);
  260. static Scheme_Object *byte_string_copy (int argc, Scheme_Object *argv[]);
  261. static Scheme_Object *byte_string_copy_bang (int argc, Scheme_Object *argv[]);
  262. static Scheme_Object *byte_string_fill (int argc, Scheme_Object *argv[]);
  263. static Scheme_Object *byte_string_to_immutable (int argc, Scheme_Object *argv[]);
  264. static Scheme_Object *byte_string_utf8_index (int argc, Scheme_Object *argv[]);
  265. static Scheme_Object *byte_string_utf8_ref (int argc, Scheme_Object *argv[]);
  266. static Scheme_Object *byte_string_utf8_length (int argc, Scheme_Object *argv[]);
  267. static Scheme_Object *byte_string_to_char_string (int argc, Scheme_Object *argv[]);
  268. static Scheme_Object *byte_string_to_char_string_locale (int argc, Scheme_Object *argv[]);
  269. static Scheme_Object *byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[]);
  270. static Scheme_Object *char_string_to_byte_string (int argc, Scheme_Object *argv[]);
  271. static Scheme_Object *char_string_to_byte_string_locale (int argc, Scheme_Object *argv[]);
  272. static Scheme_Object *char_string_to_byte_string_latin1 (int argc, Scheme_Object *argv[]);
  273. static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[]);
  274. static Scheme_Object *version(int argc, Scheme_Object *argv[]);
  275. static Scheme_Object *format(int argc, Scheme_Object *argv[]);
  276. static Scheme_Object *sch_printf(int argc, Scheme_Object *argv[]);
  277. static Scheme_Object *sch_eprintf(int argc, Scheme_Object *argv[]);
  278. static Scheme_Object *sch_fprintf(int argc, Scheme_Object *argv[]);
  279. static Scheme_Object *banner(int argc, Scheme_Object *argv[]);
  280. static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[]);
  281. static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[]);
  282. static Scheme_Object *system_type(int argc, Scheme_Object *argv[]);
  283. static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[]);
  284. static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[]);
  285. static Scheme_Object *current_locale(int argc, Scheme_Object *argv[]);
  286. static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[]);
  287. static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[]);
  288. static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object *argv[]);
  289. static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object *argv[]);
  290. static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[]);
  291. static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[]);
  292. static Scheme_Object *byte_converter_p(int argc, Scheme_Object *argv[]);
  293. #ifdef MZ_PRECISE_GC
  294. static void register_traversers(void);
  295. #endif
  296. static int mz_char_strcmp(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
  297. static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
  298. static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2);
  299. XFORM_NONGCING static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end,
  300. unsigned int *us, intptr_t dstart, intptr_t dend,
  301. intptr_t *ipos, intptr_t *jpos,
  302. char compact, char utf16,
  303. int *state, int might_continue, int permissive);
  304. XFORM_NONGCING static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end,
  305. unsigned char *s, intptr_t dstart, intptr_t dend,
  306. intptr_t *_ipos, intptr_t *_opos, char utf16);
  307. static char *string_to_from_locale(int to_bytes,
  308. char *in, intptr_t delta, intptr_t len,
  309. intptr_t *olen, int perm,
  310. int *no_cvt);
  311. #define portable_isspace(x) (((x) < 128) && isspace(x))
  312. ROSYM static Scheme_Object *sys_symbol;
  313. ROSYM static Scheme_Object *platform_3m_path, *platform_cgc_path;
  314. READ_ONLY static Scheme_Object *zero_length_char_string;
  315. READ_ONLY static Scheme_Object *zero_length_byte_string;
  316. SHARED_OK static Scheme_Hash_Table *putenv_str_table;
  317. SHARED_OK static char *embedding_banner;
  318. SHARED_OK static Scheme_Object *vers_str;
  319. SHARED_OK static Scheme_Object *banner_str;
  320. READ_ONLY static Scheme_Object *complete_symbol, *continues_symbol, *aborts_symbol, *error_symbol;
  321. void
  322. scheme_init_string (Scheme_Env *env)
  323. {
  324. Scheme_Object *p;
  325. REGISTER_SO(sys_symbol);
  326. sys_symbol = scheme_intern_symbol(SYSTEM_TYPE_NAME);
  327. REGISTER_SO(zero_length_char_string);
  328. REGISTER_SO(zero_length_byte_string);
  329. zero_length_char_string = scheme_alloc_char_string(0, 0);
  330. zero_length_byte_string = scheme_alloc_byte_string(0, 0);
  331. REGISTER_SO(complete_symbol);
  332. REGISTER_SO(continues_symbol);
  333. REGISTER_SO(aborts_symbol);
  334. REGISTER_SO(error_symbol);
  335. complete_symbol = scheme_intern_symbol("complete");
  336. continues_symbol = scheme_intern_symbol("continues");
  337. aborts_symbol = scheme_intern_symbol("aborts");
  338. error_symbol = scheme_intern_symbol("error");
  339. REGISTER_SO(platform_3m_path);
  340. #ifdef UNIX_FILE_SYSTEM
  341. # define MZ3M_SUBDIR "/3m"
  342. #else
  343. # ifdef DOS_FILE_SYSTEM
  344. # define MZ3M_SUBDIR "\\3m"
  345. # else
  346. # define MZ3M_SUBDIR ":3m"
  347. # endif
  348. #endif
  349. REGISTER_SO(platform_3m_path);
  350. REGISTER_SO(platform_cgc_path);
  351. platform_cgc_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH);
  352. platform_3m_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH MZ3M_SUBDIR);
  353. REGISTER_SO(putenv_str_table);
  354. REGISTER_SO(embedding_banner);
  355. REGISTER_SO(vers_str);
  356. REGISTER_SO(banner_str);
  357. vers_str = scheme_make_utf8_string(scheme_version());
  358. SCHEME_SET_CHAR_STRING_IMMUTABLE(vers_str);
  359. banner_str = scheme_make_utf8_string(scheme_banner());
  360. SCHEME_SET_CHAR_STRING_IMMUTABLE(banner_str);
  361. p = scheme_make_folding_prim(string_p, "string?", 1, 1, 1);
  362. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_UNARY_INLINED;
  363. scheme_add_global_constant("string?", p, env);
  364. scheme_add_global_constant("make-string",
  365. scheme_make_immed_prim(make_string,
  366. "make-string",
  367. 1, 2),
  368. env);
  369. scheme_add_global_constant("string",
  370. scheme_make_immed_prim(string,
  371. "string",
  372. 0, -1),
  373. env);
  374. scheme_add_global_constant("string-length",
  375. scheme_make_folding_prim(string_length,
  376. "string-length",
  377. 1, 1, 1),
  378. env);
  379. p = scheme_make_immed_prim(scheme_checked_string_ref, "string-ref", 2, 2);
  380. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED;
  381. scheme_add_global_constant("string-ref", p, env);
  382. p = scheme_make_immed_prim(scheme_checked_string_set, "string-set!", 3, 3);
  383. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
  384. scheme_add_global_constant("string-set!", p, env);
  385. scheme_add_global_constant("string=?",
  386. scheme_make_immed_prim(string_eq,
  387. "string=?",
  388. 2, -1),
  389. env);
  390. scheme_add_global_constant("string-locale=?",
  391. scheme_make_immed_prim(string_locale_eq,
  392. "string-locale=?",
  393. 2, -1),
  394. env);
  395. scheme_add_global_constant("string-ci=?",
  396. scheme_make_immed_prim(string_ci_eq,
  397. "string-ci=?",
  398. 2, -1),
  399. env);
  400. scheme_add_global_constant("string-locale-ci=?",
  401. scheme_make_immed_prim(string_locale_ci_eq,
  402. "string-locale-ci=?",
  403. 2, -1),
  404. env);
  405. scheme_add_global_constant("string<?",
  406. scheme_make_immed_prim(string_lt,
  407. "string<?",
  408. 2, -1),
  409. env);
  410. scheme_add_global_constant("string-locale<?",
  411. scheme_make_immed_prim(string_locale_lt,
  412. "string-locale<?",
  413. 2, -1),
  414. env);
  415. scheme_add_global_constant("string>?",
  416. scheme_make_immed_prim(string_gt,
  417. "string>?",
  418. 2, -1),
  419. env);
  420. scheme_add_global_constant("string-locale>?",
  421. scheme_make_immed_prim(string_locale_gt,
  422. "string-locale>?",
  423. 2, -1),
  424. env);
  425. scheme_add_global_constant("string<=?",
  426. scheme_make_immed_prim(string_lt_eq,
  427. "string<=?",
  428. 2, -1),
  429. env);
  430. scheme_add_global_constant("string>=?",
  431. scheme_make_immed_prim(string_gt_eq,
  432. "string>=?",
  433. 2, -1),
  434. env);
  435. scheme_add_global_constant("string-ci<?",
  436. scheme_make_immed_prim(string_ci_lt,
  437. "string-ci<?",
  438. 2, -1),
  439. env);
  440. scheme_add_global_constant("string-locale-ci<?",
  441. scheme_make_immed_prim(string_locale_ci_lt,
  442. "string-locale-ci<?",
  443. 2, -1),
  444. env);
  445. scheme_add_global_constant("string-ci>?",
  446. scheme_make_immed_prim(string_ci_gt,
  447. "string-ci>?",
  448. 2, -1),
  449. env);
  450. scheme_add_global_constant("string-locale-ci>?",
  451. scheme_make_immed_prim(string_locale_ci_gt,
  452. "string-locale-ci>?",
  453. 2, -1),
  454. env);
  455. scheme_add_global_constant("string-ci<=?",
  456. scheme_make_immed_prim(string_ci_lt_eq,
  457. "string-ci<=?",
  458. 2, -1),
  459. env);
  460. scheme_add_global_constant("string-ci>=?",
  461. scheme_make_immed_prim(string_ci_gt_eq,
  462. "string-ci>=?",
  463. 2, -1),
  464. env);
  465. scheme_add_global_constant("substring",
  466. scheme_make_immed_prim(substring,
  467. "substring",
  468. 2, 3),
  469. env);
  470. scheme_add_global_constant("string-append",
  471. scheme_make_immed_prim(string_append,
  472. "string-append",
  473. 0, -1),
  474. env);
  475. scheme_add_global_constant("string->list",
  476. scheme_make_immed_prim(string_to_list,
  477. "string->list",
  478. 1, 1),
  479. env);
  480. scheme_add_global_constant("list->string",
  481. scheme_make_immed_prim(list_to_string,
  482. "list->string",
  483. 1, 1),
  484. env);
  485. scheme_add_global_constant("string-copy",
  486. scheme_make_immed_prim(string_copy,
  487. "string-copy",
  488. 1, 1),
  489. env);
  490. scheme_add_global_constant("string-copy!",
  491. scheme_make_immed_prim(string_copy_bang,
  492. "string-copy!",
  493. 3, 5),
  494. env);
  495. scheme_add_global_constant("string-fill!",
  496. scheme_make_immed_prim(string_fill,
  497. "string-fill!",
  498. 2, 2),
  499. env);
  500. scheme_add_global_constant("string->immutable-string",
  501. scheme_make_immed_prim(string_to_immutable,
  502. "string->immutable-string",
  503. 1, 1),
  504. env);
  505. scheme_add_global_constant("string-normalize-nfc",
  506. scheme_make_immed_prim(string_normalize_c,
  507. "string-normalize-nfc",
  508. 1, 1),
  509. env);
  510. scheme_add_global_constant("string-normalize-nfkc",
  511. scheme_make_immed_prim(string_normalize_kc,
  512. "string-normalize-nfkc",
  513. 1, 1),
  514. env);
  515. scheme_add_global_constant("string-normalize-nfd",
  516. scheme_make_immed_prim(string_normalize_d,
  517. "string-normalize-nfd",
  518. 1, 1),
  519. env);
  520. scheme_add_global_constant("string-normalize-nfkd",
  521. scheme_make_immed_prim(string_normalize_kd,
  522. "string-normalize-nfkd",
  523. 1, 1),
  524. env);
  525. scheme_add_global_constant("string-upcase",
  526. scheme_make_immed_prim(string_upcase,
  527. "string-upcase",
  528. 1, 1),
  529. env);
  530. scheme_add_global_constant("string-downcase",
  531. scheme_make_immed_prim(string_downcase,
  532. "string-downcase",
  533. 1, 1),
  534. env);
  535. scheme_add_global_constant("string-titlecase",
  536. scheme_make_immed_prim(string_titlecase,
  537. "string-titlecase",
  538. 1, 1),
  539. env);
  540. scheme_add_global_constant("string-foldcase",
  541. scheme_make_immed_prim(string_foldcase,
  542. "string-foldcase",
  543. 1, 1),
  544. env);
  545. scheme_add_global_constant("string-locale-upcase",
  546. scheme_make_immed_prim(string_locale_upcase,
  547. "string-locale-upcase",
  548. 1, 1),
  549. env);
  550. scheme_add_global_constant("string-locale-downcase",
  551. scheme_make_immed_prim(string_locale_downcase,
  552. "string-locale-downcase",
  553. 1, 1),
  554. env);
  555. scheme_add_global_constant("current-locale",
  556. scheme_register_parameter(current_locale,
  557. "current-locale",
  558. MZCONFIG_LOCALE),
  559. env);
  560. scheme_add_global_constant("locale-string-encoding",
  561. scheme_make_immed_prim(locale_string_encoding,
  562. "locale-string-encoding",
  563. 0, 0),
  564. env);
  565. scheme_add_global_constant("system-language+country",
  566. scheme_make_immed_prim(system_language_country,
  567. "system-language+country",
  568. 0, 0),
  569. env);
  570. scheme_add_global_constant("bytes-converter?",
  571. scheme_make_immed_prim(byte_converter_p,
  572. "bytes-converter?",
  573. 1, 1),
  574. env);
  575. scheme_add_global_constant("bytes-convert",
  576. scheme_make_prim_w_arity2(byte_string_convert,
  577. "bytes-convert",
  578. 1, 7,
  579. 3, 3),
  580. env);
  581. scheme_add_global_constant("bytes-convert-end",
  582. scheme_make_prim_w_arity2(byte_string_convert_end,
  583. "bytes-convert-end",
  584. 0, 3,
  585. 2, 2),
  586. env);
  587. scheme_add_global_constant("bytes-open-converter",
  588. scheme_make_immed_prim(byte_string_open_converter,
  589. "bytes-open-converter",
  590. 2, 2),
  591. env);
  592. scheme_add_global_constant("bytes-close-converter",
  593. scheme_make_immed_prim(byte_string_close_converter,
  594. "bytes-close-converter",
  595. 1, 1),
  596. env);
  597. scheme_add_global_constant("format",
  598. scheme_make_noncm_prim(format,
  599. "format",
  600. 1, -1),
  601. env);
  602. scheme_add_global_constant("printf",
  603. scheme_make_noncm_prim(sch_printf,
  604. "printf",
  605. 1, -1),
  606. env);
  607. scheme_add_global_constant("eprintf",
  608. scheme_make_noncm_prim(sch_eprintf,
  609. "eprintf",
  610. 1, -1),
  611. env);
  612. scheme_add_global_constant("fprintf",
  613. scheme_make_noncm_prim(sch_fprintf,
  614. "fprintf",
  615. 2, -1),
  616. env);
  617. scheme_add_global_constant("byte?",
  618. scheme_make_folding_prim(byte_p,
  619. "byte?",
  620. 1, 1, 1),
  621. env);
  622. p = scheme_make_folding_prim(byte_string_p, "bytes?", 1, 1, 1);
  623. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
  624. scheme_add_global_constant("bytes?", p, env);
  625. scheme_add_global_constant("make-bytes",
  626. scheme_make_immed_prim(make_byte_string,
  627. "make-bytes",
  628. 1, 2),
  629. env);
  630. scheme_add_global_constant("bytes",
  631. scheme_make_immed_prim(byte_string,
  632. "bytes",
  633. 0, -1),
  634. env);
  635. GLOBAL_PRIM_W_ARITY("make-shared-bytes", make_shared_byte_string, 1, 2, env);
  636. GLOBAL_PRIM_W_ARITY("shared-bytes", shared_byte_string, 0, -1, env);
  637. scheme_add_global_constant("bytes-length",
  638. scheme_make_folding_prim(byte_string_length,
  639. "bytes-length",
  640. 1, 1, 1),
  641. env);
  642. p = scheme_make_immed_prim(scheme_checked_byte_string_ref, "bytes-ref", 2, 2);
  643. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED;
  644. scheme_add_global_constant("bytes-ref", p, env);
  645. p = scheme_make_immed_prim(scheme_checked_byte_string_set, "bytes-set!", 3, 3);
  646. SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_NARY_INLINED;
  647. scheme_add_global_constant("bytes-set!", p, env);
  648. scheme_add_global_constant("bytes=?",
  649. scheme_make_immed_prim(byte_string_eq,
  650. "bytes=?",
  651. 2, -1),
  652. env);
  653. scheme_add_global_constant("bytes<?",
  654. scheme_make_immed_prim(byte_string_lt,
  655. "bytes<?",
  656. 2, -1),
  657. env);
  658. scheme_add_global_constant("bytes>?",
  659. scheme_make_immed_prim(byte_string_gt,
  660. "bytes>?",
  661. 2, -1),
  662. env);
  663. scheme_add_global_constant("subbytes",
  664. scheme_make_immed_prim(byte_substring,
  665. "subbytes",
  666. 2, 3),
  667. env);
  668. scheme_add_global_constant("bytes-append",
  669. scheme_make_immed_prim(byte_string_append,
  670. "bytes-append",
  671. 0, -1),
  672. env);
  673. scheme_add_global_constant("bytes->list",
  674. scheme_make_immed_prim(byte_string_to_list,
  675. "bytes->list",
  676. 1, 1),
  677. env);
  678. scheme_add_global_constant("list->bytes",
  679. scheme_make_immed_prim(list_to_byte_string,
  680. "list->bytes",
  681. 1, 1),
  682. env);
  683. scheme_add_global_constant("bytes-copy",
  684. scheme_make_immed_prim(byte_string_copy,
  685. "bytes-copy",
  686. 1, 1),
  687. env);
  688. scheme_add_global_constant("bytes-copy!",
  689. scheme_make_immed_prim(byte_string_copy_bang,
  690. "bytes-copy!",
  691. 3, 5),
  692. env);
  693. scheme_add_global_constant("bytes-fill!",
  694. scheme_make_immed_prim(byte_string_fill,
  695. "bytes-fill!",
  696. 2, 2),
  697. env);
  698. scheme_add_global_constant("bytes->immutable-bytes",
  699. scheme_make_immed_prim(byte_string_to_immutable,
  700. "bytes->immutable-bytes",
  701. 1, 1),
  702. env);
  703. scheme_add_global_constant("bytes-utf-8-index",
  704. scheme_make_immed_prim(byte_string_utf8_index,
  705. "bytes-utf-8-index",
  706. 2, 4),
  707. env);
  708. scheme_add_global_constant("bytes-utf-8-length",
  709. scheme_make_immed_prim(byte_string_utf8_length,
  710. "bytes-utf-8-length",
  711. 1, 4),
  712. env);
  713. scheme_add_global_constant("bytes-utf-8-ref",
  714. scheme_make_immed_prim(byte_string_utf8_ref,
  715. "bytes-utf-8-ref",
  716. 2, 4),
  717. env);
  718. scheme_add_global_constant("bytes->string/utf-8",
  719. scheme_make_immed_prim(byte_string_to_char_string,
  720. "bytes->string/utf-8",
  721. 1, 4),
  722. env);
  723. scheme_add_global_constant("bytes->string/locale",
  724. scheme_make_immed_prim(byte_string_to_char_string_locale,
  725. "bytes->string/locale",
  726. 1, 4),
  727. env);
  728. scheme_add_global_constant("bytes->string/latin-1",
  729. scheme_make_immed_prim(byte_string_to_char_string_latin1,
  730. "bytes->string/latin-1",
  731. 1, 4),
  732. env);
  733. scheme_add_global_constant("string->bytes/utf-8",
  734. scheme_make_immed_prim(char_string_to_byte_string,
  735. "string->bytes/utf-8",
  736. 1, 4),
  737. env);
  738. scheme_add_global_constant("string->bytes/locale",
  739. scheme_make_immed_prim(char_string_to_byte_string_locale,
  740. "string->bytes/locale",
  741. 1, 4),
  742. env);
  743. scheme_add_global_constant("string->bytes/latin-1",
  744. scheme_make_immed_prim(char_string_to_byte_string_latin1,
  745. "string->bytes/latin-1",
  746. 1, 4),
  747. env);
  748. scheme_add_global_constant("string-utf-8-length",
  749. scheme_make_immed_prim(char_string_utf8_length,
  750. "string-utf-8-length",
  751. 1, 3),
  752. env);
  753. /* In principle, `version' could be foldable, but it invites
  754. more problems than it solves... */
  755. scheme_add_global_constant("version",
  756. scheme_make_immed_prim(version,
  757. "version",
  758. 0, 0),
  759. env);
  760. scheme_add_global_constant("banner",
  761. scheme_make_immed_prim(banner,
  762. "banner",
  763. 0, 0),
  764. env);
  765. scheme_add_global_constant("getenv",
  766. scheme_make_immed_prim(sch_getenv,
  767. "getenv",
  768. 1, 1),
  769. env);
  770. scheme_add_global_constant("putenv",
  771. scheme_make_immed_prim(sch_putenv,
  772. "putenv",
  773. 2, 2),
  774. env);
  775. /* Don't make these folding, since they're platform-specific: */
  776. scheme_add_global_constant("system-type",
  777. scheme_make_immed_prim(system_type,
  778. "system-type",
  779. 0, 1),
  780. env);
  781. scheme_add_global_constant("system-library-subpath",
  782. scheme_make_immed_prim(system_library_subpath,
  783. "system-library-subpath",
  784. 0, 1),
  785. env);
  786. scheme_add_global_constant("current-command-line-arguments",
  787. scheme_register_parameter(cmdline_args,
  788. "current-command-line-arguments",
  789. MZCONFIG_CMDLINE_ARGS),
  790. env);
  791. #ifdef MZ_PRECISE_GC
  792. register_traversers();
  793. #endif
  794. }
  795. void scheme_init_string_places(void) {
  796. REGISTER_SO(current_locale_name_ptr);
  797. current_locale_name_ptr = "xxxx\0\0\0\0";
  798. }
  799. /**********************************************************************/
  800. /* UTF-8 char constructors */
  801. /**********************************************************************/
  802. Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, intptr_t d, intptr_t len)
  803. {
  804. intptr_t ulen;
  805. mzchar *us;
  806. if (len) {
  807. ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
  808. NULL, 0, -1,
  809. NULL, 0 /* not UTF-16 */, 0xFFFD);
  810. us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
  811. scheme_utf8_decode((unsigned char *)chars, d, d + len,
  812. us, 0, -1,
  813. NULL, 0 /* not UTF-16 */, 0xFFFD);
  814. us[ulen] = 0;
  815. } else {
  816. us = (mzchar *)"\0\0\0";
  817. ulen = 0;
  818. }
  819. return scheme_make_sized_offset_char_string(us, 0, ulen, 0);
  820. }
  821. Scheme_Object *
  822. scheme_make_sized_utf8_string(char *chars, intptr_t len)
  823. {
  824. return scheme_make_sized_offset_utf8_string(chars, 0, len);
  825. }
  826. Scheme_Object *
  827. scheme_make_immutable_sized_utf8_string(char *chars, intptr_t len)
  828. {
  829. Scheme_Object *s;
  830. s = scheme_make_sized_offset_utf8_string(chars, 0, len);
  831. if (len)
  832. SCHEME_SET_CHAR_STRING_IMMUTABLE(s);
  833. return s;
  834. }
  835. Scheme_Object *
  836. scheme_make_utf8_string(const char *chars)
  837. {
  838. return scheme_make_sized_offset_utf8_string((char *)chars, 0, -1);
  839. }
  840. Scheme_Object *
  841. scheme_make_locale_string(const char *chars)
  842. {
  843. return scheme_byte_string_to_char_string_locale(scheme_make_byte_string((char *)chars));
  844. }
  845. /**********************************************************************/
  846. /* index helpers */
  847. /**********************************************************************/
  848. void scheme_out_of_string_range(const char *name, const char *which,
  849. Scheme_Object *i, Scheme_Object *s,
  850. intptr_t start, intptr_t len)
  851. {
  852. int is_byte;
  853. is_byte = SCHEME_BYTE_STRINGP(s);
  854. if (len) {
  855. char *sstr;
  856. intptr_t slen;
  857. sstr = scheme_make_provided_string(s, 2, &slen);
  858. scheme_raise_exn(MZEXN_FAIL_CONTRACT,
  859. "%s: %sindex %s out of range [%d, %d] for %s%s: %t",
  860. name, which,
  861. scheme_make_provided_string(i, 2, NULL),
  862. ((start < 0) ? 0 : start),
  863. ((start < 0) ? (len - 1) : len),
  864. is_byte ? "byte-" : "",
  865. SCHEME_CHAPERONE_VECTORP(s) ? "vector" : "string",
  866. sstr, slen);
  867. } else {
  868. scheme_raise_exn(MZEXN_FAIL_CONTRACT,
  869. "%s: %sindex %s out of range for empty %s%s",
  870. name, which,
  871. scheme_make_provided_string(i, 0, NULL),
  872. is_byte ? "byte-" : "",
  873. SCHEME_CHAPERONE_VECTORP(s) ? "vector" : "string");
  874. }
  875. }
  876. intptr_t scheme_extract_index(const char *name, int pos, int argc, Scheme_Object **argv, intptr_t top, int false_ok)
  877. {
  878. intptr_t i;
  879. int is_top = 0;
  880. if (SCHEME_INTP(argv[pos])) {
  881. i = SCHEME_INT_VAL(argv[pos]);
  882. } else if (SCHEME_BIGNUMP(argv[pos])) {
  883. if (SCHEME_BIGPOS(argv[pos])) {
  884. i = top; /* out-of-bounds */
  885. is_top = 1;
  886. } else
  887. i = -1; /* negative */
  888. } else
  889. i = -1;
  890. if (!is_top && (i < 0))
  891. scheme_wrong_type(name,
  892. (false_ok ? "non-negative exact integer or #f" : "non-negative exact integer"),
  893. pos, argc, argv);
  894. return i;
  895. }
  896. void scheme_get_substring_indices(const char *name, Scheme_Object *str,
  897. int argc, Scheme_Object **argv,
  898. int spos, int fpos, intptr_t *_start, intptr_t *_finish)
  899. {
  900. intptr_t len;
  901. intptr_t start, finish;
  902. if (SCHEME_CHAPERONE_VECTORP(str))
  903. len = SCHEME_VEC_SIZE(str);
  904. else if (SCHEME_CHAR_STRINGP(str))
  905. len = SCHEME_CHAR_STRTAG_VAL(str);
  906. else
  907. len = SCHEME_BYTE_STRTAG_VAL(str);
  908. if (argc > spos)
  909. start = scheme_extract_index(name, spos, argc, argv, len + 1, 0);
  910. else
  911. start = 0;
  912. if (argc > fpos)
  913. finish = scheme_extract_index(name, fpos, argc, argv, len + 1, 0);
  914. else
  915. finish = len;
  916. if (!(start <= len)) {
  917. scheme_out_of_string_range(name, (fpos < 100) ? "starting " : "", argv[spos], str, 0, len);
  918. }
  919. if (!(finish >= start && finish <= len)) {
  920. scheme_out_of_string_range(name, "ending ", argv[fpos], str, start, len);
  921. }
  922. *_start = start;
  923. *_finish = finish;
  924. }
  925. void scheme_do_get_substring_indices(const char *name, Scheme_Object *str,
  926. int argc, Scheme_Object **argv,
  927. int spos, int fpos, intptr_t *_start, intptr_t *_finish, intptr_t len)
  928. {
  929. if (argc > spos) {
  930. if (SCHEME_INTP(argv[spos])) {
  931. intptr_t start = SCHEME_INT_VAL(argv[spos]);
  932. if ((start >= 0) && (start < len)) {
  933. *_start = start;
  934. if (argc > fpos) {
  935. intptr_t finish = SCHEME_INT_VAL(argv[fpos]);
  936. if ((finish >= start) && (finish <= len)) {
  937. *_finish = finish;
  938. return;
  939. }
  940. } else {
  941. *_finish = len;
  942. return;
  943. }
  944. }
  945. }
  946. } else {
  947. *_start = 0;
  948. *_finish = len;
  949. return;
  950. }
  951. scheme_get_substring_indices(name, str, argc, argv, spos, fpos, _start, _finish);
  952. }
  953. /**********************************************************************/
  954. /* char strings */
  955. /**********************************************************************/
  956. #define SCHEME_X_STR_VAL(x) SCHEME_CHAR_STR_VAL(x)
  957. #define SCHEME_X_STRTAG_VAL(x) SCHEME_CHAR_STRTAG_VAL(x)
  958. #define SCHEME_X_STRINGP(x) SCHEME_CHAR_STRINGP(x)
  959. #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_CHAR_STRINGP(x)
  960. #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_CHAR_STRING_IMMUTABLE(x)
  961. #define scheme_x_string_type scheme_char_string_type
  962. #define X(a, b) a##_char##b
  963. #define X_(a, b) a##_##b
  964. #define X__(a) a
  965. #define EMPTY (mzchar *)"\0\0\0"
  966. #define Xchar mzchar
  967. #define uXchar mzchar
  968. #define XSTR ""
  969. #define XSTRINGSTR "string"
  970. #define SUBXSTR "substring"
  971. #define CHARP(x) SCHEME_CHARP(x)
  972. #define CHAR_VAL(x) SCHEME_CHAR_VAL(x)
  973. #define CHAR_STR "character"
  974. #define MAKE_CHAR(x) _scheme_make_char(x)
  975. #define xstrlen scheme_char_strlen
  976. #include "strops.inc"
  977. #define GEN_STRING_COMP(name, scheme_name, comp, op, ul, size_shortcut) \
  978. static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
  979. { mzchar *s, *prev; int i, sl, pl; int falz = 0;\
  980. if (!SCHEME_CHAR_STRINGP(argv[0])) \
  981. scheme_wrong_type(scheme_name, "string", 0, argc, argv); \
  982. prev = SCHEME_CHAR_STR_VAL(argv[0]); pl = SCHEME_CHAR_STRTAG_VAL(argv[0]); \
  983. for (i = 1; i < argc; i++) { \
  984. if (!SCHEME_CHAR_STRINGP(argv[i])) \
  985. scheme_wrong_type(scheme_name, "string", i, argc, argv); \
  986. s = SCHEME_CHAR_STR_VAL(argv[i]); sl = SCHEME_CHAR_STRTAG_VAL(argv[i]); \
  987. if (!falz) if (!(comp(scheme_name, \
  988. prev, pl, \
  989. s, sl, ul, size_shortcut) op 0)) falz = 1; \
  990. prev = s; pl = sl; \
  991. } \
  992. return falz ? scheme_false : scheme_true; \
  993. }
  994. GEN_STRING_COMP(string_eq, "string=?", mz_char_strcmp, ==, 0, 1)
  995. GEN_STRING_COMP(string_lt, "string<?", mz_char_strcmp, <, 0, 0)
  996. GEN_STRING_COMP(string_gt, "string>?", mz_char_strcmp, >, 0, 0)
  997. GEN_STRING_COMP(string_lt_eq, "string<=?", mz_char_strcmp, <=, 0, 0)
  998. GEN_STRING_COMP(string_gt_eq, "string>=?", mz_char_strcmp, >=, 0, 0)
  999. GEN_STRING_COMP(string_ci_eq, "string-ci=?", mz_char_strcmp_ci, ==, 0, 0)
  1000. GEN_STRING_COMP(string_ci_lt, "string-ci<?", mz_char_strcmp_ci, <, 0, 0)
  1001. GEN_STRING_COMP(string_ci_gt, "string-ci>?", mz_char_strcmp_ci, >, 0, 0)
  1002. GEN_STRING_COMP(string_ci_lt_eq, "string-ci<=?", mz_char_strcmp_ci, <=, 0, 0)
  1003. GEN_STRING_COMP(string_ci_gt_eq, "string-ci>=?", mz_char_strcmp_ci, >=, 0, 0)
  1004. GEN_STRING_COMP(string_locale_eq, "string-locale=?", mz_char_strcmp, ==, 1, 0)
  1005. GEN_STRING_COMP(string_locale_lt, "string-locale<?", mz_char_strcmp, <, 1, 0)
  1006. GEN_STRING_COMP(string_locale_gt, "string-locale>?", mz_char_strcmp, >, 1, 0)
  1007. GEN_STRING_COMP(string_locale_ci_eq, "string-locale-ci=?", mz_char_strcmp_ci, ==, 1, 0)
  1008. GEN_STRING_COMP(string_locale_ci_lt, "string-locale-ci<?", mz_char_strcmp_ci, <, 1, 0)
  1009. GEN_STRING_COMP(string_locale_ci_gt, "string-locale-ci>?", mz_char_strcmp_ci, >, 1, 0)
  1010. /**********************************************************************/
  1011. /* byte strings */
  1012. /**********************************************************************/
  1013. #define SCHEME_BYTEP(x) ((SCHEME_INTP(x)) && (SCHEME_INT_VAL(x) >= 0) && (SCHEME_INT_VAL(x) <= 255))
  1014. #define BYTE_STR "exact integer in [0,255]"
  1015. static Scheme_Object *
  1016. byte_p(int argc, Scheme_Object *argv[])
  1017. {
  1018. return (SCHEME_BYTEP(argv[0]) ? scheme_true : scheme_false);
  1019. }
  1020. #define SCHEME_X_STR_VAL(x) SCHEME_BYTE_STR_VAL(x)
  1021. #define SCHEME_X_STRTAG_VAL(x) SCHEME_BYTE_STRTAG_VAL(x)
  1022. #define SCHEME_X_STRINGP(x) SCHEME_BYTE_STRINGP(x)
  1023. #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_BYTE_STRINGP(x)
  1024. #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_BYTE_STRING_IMMUTABLE(x)
  1025. #define scheme_x_string_type scheme_byte_string_type
  1026. #define X(a, b) a##_byte##b
  1027. #define X_(a, b) a##_byte_##b
  1028. #define X__(a) byte_##a
  1029. #define EMPTY ""
  1030. #define Xchar char
  1031. #define uXchar unsigned char
  1032. #define XSTR "byte "
  1033. #define XSTRINGSTR "bytes"
  1034. #define SUBXSTR "subbytes"
  1035. #define CHARP(x) SCHEME_BYTEP(x)
  1036. #define CHAR_VAL(x) SCHEME_INT_VAL(x)
  1037. #define CHAR_STR BYTE_STR
  1038. #define MAKE_CHAR(x) scheme_make_integer_value(x)
  1039. #define xstrlen strlen
  1040. #define GENERATING_BYTE
  1041. #include "strops.inc"
  1042. #undef GENERATING_BYTE
  1043. /* comparisons */
  1044. #define GEN_BYTE_STRING_COMP(name, scheme_name, comp, op) \
  1045. static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
  1046. { char *s, *prev; int i, sl, pl; int falz = 0;\
  1047. if (!SCHEME_BYTE_STRINGP(argv[0])) \
  1048. scheme_wrong_type(scheme_name, "byte string", 0, argc, argv); \
  1049. prev = SCHEME_BYTE_STR_VAL(argv[0]); pl = SCHEME_BYTE_STRTAG_VAL(argv[0]); \
  1050. for (i = 1; i < argc; i++) { \
  1051. if (!SCHEME_BYTE_STRINGP(argv[i])) \
  1052. scheme_wrong_type(scheme_name, "byte string", i, argc, argv); \
  1053. s = SCHEME_BYTE_STR_VAL(argv[i]); sl = SCHEME_BYTE_STRTAG_VAL(argv[i]); \
  1054. if (!falz) if (!(comp(scheme_name, \
  1055. (unsigned char *)prev, pl, \
  1056. (unsigned char *)s, sl) op 0)) falz = 1; \
  1057. prev = s; pl = sl; \
  1058. } \
  1059. return falz ? scheme_false : scheme_true; \
  1060. }
  1061. GEN_BYTE_STRING_COMP(byte_string_eq, "bytes=?", mz_strcmp, ==)
  1062. GEN_BYTE_STRING_COMP(byte_string_lt, "bytes<?", mz_strcmp, <)
  1063. GEN_BYTE_STRING_COMP(byte_string_gt, "bytes>?", mz_strcmp, >)
  1064. /**********************************************************************/
  1065. /* byte string <-> char string */
  1066. /**********************************************************************/
  1067. /************************* bytes->string *************************/
  1068. static Scheme_Object *
  1069. do_byte_string_to_char_string(const char *who,
  1070. Scheme_Object *bstr,
  1071. intptr_t istart, intptr_t ifinish,
  1072. int perm, int as_locale)
  1073. {
  1074. int i, ulen;
  1075. char *chars;
  1076. unsigned int *v;
  1077. chars = SCHEME_BYTE_STR_VAL(bstr);
  1078. ulen = utf8_decode_x((unsigned char *)chars, istart, ifinish,
  1079. NULL, 0, -1,
  1080. NULL, NULL, 0, 0,
  1081. NULL, 0,
  1082. (perm > -1) ? 0xD800 : 0);
  1083. if (ulen < 0) {
  1084. scheme_arg_mismatch(who,
  1085. STRING_IS_NOT_UTF_8,
  1086. bstr);
  1087. }
  1088. v = (unsigned int *)scheme_malloc_atomic((ulen + 1) * sizeof(unsigned int));
  1089. utf8_decode_x((unsigned char *)chars, istart, ifinish,
  1090. v, 0, -1,
  1091. NULL, NULL, 0, 0,
  1092. NULL, 0,
  1093. (perm > -1) ? 0xD800 : 0);
  1094. if (perm > -1) {
  1095. for (i = 0; i < ulen; i++) {
  1096. if (v[i] == 0xD800)
  1097. v[i] = perm;
  1098. }
  1099. }
  1100. v[ulen] = 0;
  1101. return scheme_make_sized_char_string(v, ulen, 0);
  1102. }
  1103. static Scheme_Object *
  1104. do_byte_string_to_char_string_locale(const char *who,
  1105. Scheme_Object *bstr,
  1106. intptr_t istart, intptr_t ifinish,
  1107. int perm)
  1108. {
  1109. char *us;
  1110. intptr_t olen;
  1111. reset_locale();
  1112. if (!iconv_ready) init_iconv();
  1113. if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on || !mzCHK_PROC(iconv_open))
  1114. return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
  1115. if (istart < ifinish) {
  1116. int no_cvt;
  1117. us = string_to_from_locale(0, SCHEME_BYTE_STR_VAL(bstr),
  1118. istart, ifinish - istart,
  1119. &olen, perm, &no_cvt);
  1120. if (!us) {
  1121. if (no_cvt) {
  1122. return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
  1123. } else {
  1124. scheme_arg_mismatch(who,
  1125. "byte string is not a valid encoding for the current locale: ",
  1126. bstr);
  1127. }
  1128. }
  1129. ((mzchar *)us)[olen] = 0;
  1130. } else {
  1131. us = "\0\0\0";
  1132. olen = 0;
  1133. }
  1134. return scheme_make_sized_char_string((mzchar *)us, olen, 0);
  1135. }
  1136. static Scheme_Object *
  1137. do_string_to_vector(const char *who, int mode, int argc, Scheme_Object *argv[])
  1138. {
  1139. int permc;
  1140. intptr_t istart, ifinish;
  1141. if (!SCHEME_BYTE_STRINGP(argv[0]))
  1142. scheme_wrong_type(who, "byte string", 0, argc, argv);
  1143. if ((argc < 2) || SCHEME_FALSEP(argv[1]))
  1144. permc = -1;
  1145. else {
  1146. if (!SCHEME_CHARP(argv[1]))
  1147. scheme_wrong_type(who, "character or #f", 1, argc, argv);
  1148. permc = SCHEME_CHAR_VAL(argv[1]);
  1149. }
  1150. scheme_get_substring_indices(who, argv[0], argc, argv,
  1151. 2, 3,
  1152. &istart, &ifinish);
  1153. if (mode == 0)
  1154. return do_byte_string_to_char_string(who, argv[0], istart, ifinish, permc, 0);
  1155. else if (mode == 1)
  1156. return do_byte_string_to_char_string_locale(who, argv[0], istart, ifinish, permc);
  1157. else {
  1158. /* Latin-1 */
  1159. mzchar *us;
  1160. unsigned char *s;
  1161. intptr_t i, len;
  1162. len = ifinish - istart;
  1163. s = (unsigned char *)SCHEME_BYTE_STR_VAL(argv[0]);
  1164. us = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
  1165. for (i = istart; i < ifinish; i++) {
  1166. us[i - istart] = s[i];
  1167. }
  1168. us[len] = 0;
  1169. return scheme_make_sized_char_string(us, len, 0);
  1170. }
  1171. }
  1172. static Scheme_Object *
  1173. byte_string_to_char_string (int argc, Scheme_Object *argv[])
  1174. {
  1175. return do_string_to_vector("bytes->string/utf-8", 0, argc, argv);
  1176. }
  1177. static Scheme_Object *
  1178. byte_string_to_char_string_locale (int argc, Scheme_Object *argv[])
  1179. {
  1180. return do_string_to_vector("bytes->string/locale", 1, argc, argv);
  1181. }
  1182. static Scheme_Object *
  1183. byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
  1184. {
  1185. return do_string_to_vector("bytes->string/latin-1", 2, argc, argv);
  1186. }
  1187. Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
  1188. {
  1189. return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
  1190. }
  1191. Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
  1192. {
  1193. return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
  1194. }
  1195. /************************* string->bytes *************************/
  1196. static Scheme_Object *do_char_string_to_byte_string(Scheme_Object *s, intptr_t istart, intptr_t ifinish,
  1197. int as_locale)
  1198. {
  1199. char *bs;
  1200. int slen;
  1201. slen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
  1202. NULL, 0,
  1203. 0 /* UTF-16 */);
  1204. bs = (char *)scheme_malloc_atomic(slen + 1);
  1205. scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
  1206. (unsigned char *)bs, 0,
  1207. 0 /* UTF-16 */);
  1208. bs[slen] = 0;
  1209. return scheme_make_sized_byte_string(bs, slen, 0);
  1210. }
  1211. static Scheme_Object *
  1212. do_char_string_to_byte_string_locale(const char *who,
  1213. Scheme_Object *cstr,
  1214. intptr_t istart, intptr_t ifinish,
  1215. int perm)
  1216. {
  1217. char *s;
  1218. intptr_t olen;
  1219. reset_locale();
  1220. if (!iconv_ready) init_iconv();
  1221. if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on || !mzCHK_PROC(iconv_open))
  1222. return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
  1223. if (istart < ifinish) {
  1224. int no_cvt;
  1225. s = string_to_from_locale(1, (char *)SCHEME_CHAR_STR_VAL(cstr),
  1226. istart, ifinish - istart,
  1227. &olen, perm, &no_cvt);
  1228. if (!s) {
  1229. if (no_cvt) {
  1230. return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
  1231. } else {
  1232. scheme_arg_mismatch(who,
  1233. "string cannot be encoded for the current locale: ",
  1234. cstr);
  1235. }
  1236. }
  1237. s[olen] = 0;
  1238. } else {
  1239. s = "";
  1240. olen = 0;
  1241. }
  1242. return scheme_make_sized_byte_string(s, olen, 0);
  1243. }
  1244. Scheme_Object *scheme_char_string_to_byte_string(Scheme_Object *s)
  1245. {
  1246. return do_char_string_to_byte_string(s, 0, SCHEME_CHAR_STRLEN_VAL(s), 0);
  1247. }
  1248. Scheme_Object *scheme_char_string_to_byte_string_locale(Scheme_Object *s)
  1249. {
  1250. return do_char_string_to_byte_string_locale("s->s", s, 0, SCHEME_CHAR_STRLEN_VAL(s), '?');
  1251. }
  1252. static Scheme_Object *do_chars_to_bytes(const char *who, int mode,
  1253. int argc, Scheme_Object *argv[])
  1254. {
  1255. intptr_t istart, ifinish;
  1256. int permc;
  1257. if (!SCHEME_CHAR_STRINGP(argv[0]))
  1258. scheme_wrong_type(who, "string", 0, argc, argv);
  1259. if ((argc < 2) || SCHEME_FALSEP(argv[1]))
  1260. permc = -1;
  1261. else {
  1262. if (!SCHEME_BYTEP(argv[1]))
  1263. scheme_wrong_type(who, "byte or #f", 1, argc, argv);
  1264. permc = SCHEME_INT_VAL(argv[1]);
  1265. }
  1266. scheme_get_substring_indices(who, argv[0], argc, argv,
  1267. 2, 3, &istart, &ifinish);
  1268. if (mode == 1)
  1269. return do_char_string_to_byte_string_locale(who, argv[0], istart, ifinish, permc);
  1270. else if (mode == 0)
  1271. return do_char_string_to_byte_string(argv[0], istart, ifinish, 0);
  1272. else {
  1273. /* Latin-1 */
  1274. mzchar *us;
  1275. unsigned char *s;
  1276. intptr_t i, len;
  1277. len = ifinish - istart;
  1278. us = SCHEME_CHAR_STR_VAL(argv[0]);
  1279. s = (unsigned char *)scheme_malloc_atomic(len + 1);
  1280. for (i = istart; i < ifinish; i++) {
  1281. if (us[i] < 256)
  1282. s[i - istart] = us[i];
  1283. else if (permc >= 0) {
  1284. s[i - istart] = permc;
  1285. } else {
  1286. scheme_arg_mismatch(who,
  1287. "string cannot be encoded in Latin-1: ",
  1288. argv[0]);
  1289. }
  1290. }
  1291. s[len] = 0;
  1292. return scheme_make_sized_byte_string((char *)s, len, 0);
  1293. }
  1294. }
  1295. static Scheme_Object *char_string_to_byte_string(int argc, Scheme_Object *argv[])
  1296. {
  1297. return do_chars_to_bytes("string->bytes/utf-8", 0, argc, argv);
  1298. }
  1299. static Scheme_Object *char_string_to_byte_string_locale(int argc, Scheme_Object *argv[])
  1300. {
  1301. return do_chars_to_bytes("string->bytes/locale", 1, argc, argv);
  1302. }
  1303. static Scheme_Object *char_string_to_byte_string_latin1(int argc, Scheme_Object *argv[])
  1304. {
  1305. return do_chars_to_bytes("string->bytes/latin-1", 2, argc, argv);
  1306. }
  1307. /************************* Other *************************/
  1308. static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[])
  1309. {
  1310. intptr_t istart, ifinish, len;
  1311. if (!SCHEME_CHAR_STRINGP(argv[0]))
  1312. scheme_wrong_type("string-utf-8-length", "string", 0, argc, argv);
  1313. scheme_get_substring_indices("string-utf-8-length", argv[0], argc, argv,
  1314. 1, 2, &istart, &ifinish);
  1315. len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[0]), istart, ifinish,
  1316. NULL, 0, 0);
  1317. return scheme_make_integer(len);
  1318. }
  1319. static Scheme_Object *
  1320. byte_string_utf8_length (int argc, Scheme_Object *argv[])
  1321. {
  1322. int len, perm;
  1323. intptr_t istart, ifinish;
  1324. char *chars;
  1325. if (!SCHEME_BYTE_STRINGP(argv[0]))
  1326. scheme_wrong_type("bytes-utf-8-length", "string", 0, argc, argv);
  1327. chars = SCHEME_BYTE_STR_VAL(argv[0]);
  1328. if ((argc > 1) && !SCHEME_FALSEP(argv[1])) {
  1329. if (!SCHEME_CHARP(argv[1]))
  1330. scheme_wrong_type("bytes-utf-8-length", "character or #f", 1, argc, argv);
  1331. perm = 1;
  1332. } else
  1333. perm = 0;
  1334. scheme_get_substring_indices("bytes-utf-8-length", argv[0], argc, argv,
  1335. 2, 3,
  1336. &istart, &ifinish);
  1337. len = scheme_utf8_decode((unsigned char *)chars, istart, ifinish,
  1338. NULL, 0, -1,
  1339. NULL, 0, perm);
  1340. if (len < 0)
  1341. return scheme_false;
  1342. else
  1343. return scheme_make_integer(len);
  1344. }
  1345. static Scheme_Object *
  1346. byte_string_utf8_index(int argc, Scheme_Object *argv[])
  1347. {
  1348. intptr_t istart, ifinish, pos = -1, opos, ipos;
  1349. int result, perm;
  1350. char *chars;
  1351. if (!SCHEME_BYTE_STRINGP(argv[0]))
  1352. scheme_wrong_type("bytes-utf-8-index", "byte string", 0, argc, argv);
  1353. chars = SCHEME_BYTE_STR_VAL(argv[0]);
  1354. if (SCHEME_INTP(argv[1])) {